def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    """Encode ``src`` with three conv layers and return one L2-normalised vector.

    The input tensor is padded, convolved and rectified three times; a (1, 4)
    max-pool with stride (1, 2) follows the first two convolutions. The last
    activation is max-reduced over dimension 1, divided by its L2 norm and
    reshaped to ``(self.num_filters[2],)`` per batch element.

    Args:
        src: input sequence; only its tensor form is used.

    Returns:
        An ExpressionSequence wrapping the normalised vector (no mask is set).
    """
    src = src.as_tensor()
    batch_size = src.dim()[1]

    # Shape remarks below are the original author's example values.
    # src dim is ((40, 1000), 128)
    src = padding(src, self.filter_width[0] + 3)
    l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filters1),
                              stride=[self.stride[0], self.stride[0]],
                              is_valid=True))  # ((1, 1000, 64), 128)
    pool1 = dy.maxpooling2d(l1, (1, 4), (1, 2), is_valid=True)  # ((1, 499, 64), 128)

    pool1 = padding(pool1, self.filter_width[1] + 3)
    l2 = dy.rectify(dy.conv2d(pool1, dy.parameter(self.filters2),
                              stride=[self.stride[1], self.stride[1]],
                              is_valid=True))  # ((1, 499, 512), 128)
    pool2 = dy.maxpooling2d(l2, (1, 4), (1, 2), is_valid=True)  # ((1, 248, 512), 128)

    pool2 = padding(pool2, self.filter_width[2])
    l3 = dy.rectify(dy.conv2d(pool2, dy.parameter(self.filters3),
                              stride=[self.stride[2], self.stride[2]],
                              is_valid=True))  # ((1, 248, 1024), 128)

    pool3 = dy.max_dim(l3, d=1)
    # The small constant keeps the division defined when the norm is zero.
    my_norm = dy.l2_norm(pool3) + 1e-6
    output = dy.cdiv(pool3, my_norm)
    output = dy.reshape(output, (self.num_filters[2],), batch_size=batch_size)
    return ExpressionSequence(expr_tensor=output)
def __call__(self, es):
    """Run the stacked forward/backward layers over ``es``.

    From the second layer on, each direction receives both the forward and
    the backward output of the previous layer. The returned sequence
    concatenates, per position, the final forward output with the matching
    (re-reversed) backward output, and carries the input mask.
    """
    mask = es.mask
    fwd_layers = self.forward_layers
    bwd_layers = self.backward_layers

    # first layer
    forward_es = fwd_layers[0](es)
    rev_backward_es = bwd_layers[0](ReversedExpressionSequence(es))

    for layer_i in range(1, len(fwd_layers)):
        next_forward_es = fwd_layers[layer_i](
            [forward_es, ReversedExpressionSequence(rev_backward_es)])
        backward_out = bwd_layers[layer_i](
            [ReversedExpressionSequence(forward_es), rev_backward_es])
        rev_backward_es = ExpressionSequence(backward_out.as_list(), mask=mask)
        forward_es = next_forward_es

    # One final state per layer: forward and backward states concatenated.
    final_states = []
    for layer_i in range(len(fwd_layers)):
        fwd_state = fwd_layers[layer_i].get_final_states()[0]
        bwd_state = bwd_layers[layer_i].get_final_states()[0]
        main = dy.concatenate([fwd_state.main_expr(), bwd_state.main_expr()])
        cell = dy.concatenate([fwd_state.cell_expr(), bwd_state.cell_expr()])
        final_states.append(FinalTransducerState(main, cell))
    self._final_states = final_states

    # The backward output is stored reversed, hence the indexing from the end.
    merged = []
    for i in range(len(forward_es)):
        merged.append(dy.concatenate([forward_es[i], rev_backward_es[-(i + 1)]]))
    return ExpressionSequence(expr_list=merged, mask=mask)
def transduce(self, sent: ExpressionSequence) -> ExpressionSequence:
    """Optionally add a positional encoding to ``sent``, then run all modules.

    Args:
        sent: input sequence.

    Returns:
        The output of the last entry of ``self.modules`` (or the possibly
        position-encoded input when there are no modules).

    Raises:
        ValueError: if ``self.pos_encoding_type`` is set to something other
            than ``"trigonometric"`` or ``"embedding"``.
    """
    pos_type = self.pos_encoding_type
    if pos_type == "trigonometric":
        block = self.position_encoding_block
        # (Re)build the cached block when it is missing or too short; the
        # 1.2 factor over-allocates so slightly longer inputs reuse it.
        if block is None or block.shape[2] < len(sent):
            self.initialize_position_encoding(
                int(len(sent) * 1.2),
                self.input_dim if self.pos_encoding_combine == "add"
                else self.pos_encoding_size)
        encoding = dy.inputTensor(
            self.position_encoding_block[0, :, :len(sent)])
    elif pos_type == "embedding":
        encoding = self.positional_embedder.embed_sent(len(sent)).as_tensor()
    elif pos_type:
        # This check used to sit behind a duplicate condition and could never
        # fire; reject unknown types before `encoding` is used.
        raise ValueError(f"unknown encoding type {self.pos_encoding_type}")

    if pos_type:
        if self.pos_encoding_combine == "add":
            sent = ExpressionSequence(
                expr_tensor=sent.as_tensor() + encoding, mask=sent.mask)
        else:  # concat
            sent = ExpressionSequence(
                expr_tensor=dy.concatenate([sent.as_tensor(), encoding]),
                mask=sent.mask)

    for module in self.modules:
        sent = module.transduce(sent)
    self._final_states = [transducer.FinalTransducerState(sent[-1])]
    return sent
def transduce(self, es):
    """Run ``es`` through the stacked layers with residual connections.

    The first layer is applied directly. Every middle layer adds its input to
    its output (via ``self._sum_lists``). The last layer is applied once; its
    input is added to its output only when ``self.add_to_output`` is set.
    ``self._final_states`` receives exactly one entry per layer.

    Args:
        es: the input sequence, passed to the first layer.

    Returns:
        The output sequence of the last layer (with the residual added when
        ``self.add_to_output`` is true).
    """
    first_layer = self.builder_layers[0]
    es = first_layer(es)
    self._final_states = [first_layer.get_final_states()[0]]

    if len(self.builder_layers) == 1:
        return es

    # Middle layers only: the last layer is handled separately below.
    # Iterating over [1:] would apply the last layer twice and record an
    # extra final state.
    for layer in self.builder_layers[1:-1]:
        es = ExpressionSequence(expr_list=self._sum_lists(layer(es), es))
        self._final_states.append(
            FinalTransducerState(es[-1], layer.get_final_states()[0].cell_expr()))

    last_layer = self.builder_layers[-1]
    last_output = last_layer(es)

    if self.add_to_output:
        self._final_states.append(
            FinalTransducerState(last_output[-1],
                                 last_layer.get_final_states()[0].cell_expr()))
        return ExpressionSequence(expr_list=self._sum_lists(last_output, es))

    self._final_states.append(last_layer.get_final_states()[0])
    return last_output
def transduce(self, embed_sent: ExpressionSequence) -> ExpressionSequence:
    """Apply the convolutional stack to ``embed_sent``.

    The input tensor is zero-padded along dimension 1, passed through a first
    biased convolution, through every (filter, bias) pair in
    ``self.builder_layers`` and through a last biased convolution. The
    non-linearity named by ``self.non_linearity`` follows every convolution
    except the last one.

    Args:
        embed_sent: input sequence; only its tensor form is used.

    Returns:
        An ExpressionSequence wrapping the reshaped output (no mask is set).
    """
    def activate(expr):
        # Strings are compared by value; identity (`is`) only works by
        # accident for interned literals and fails for loaded config values.
        kind = self.non_linearity
        if kind == 'tanh':
            return dy.tanh(expr)
        if kind == 'relu':
            return dy.rectify(expr)
        if kind == 'sigmoid':
            return dy.logistic(expr)
        # 'linear' and unrecognised values leave the activation untouched.
        return expr

    src = embed_sent.as_tensor()
    sent_len = src.dim()[0][1]
    batch_size = src.dim()[1]

    # TODO adapt it also for even window size
    # Integer division: the pad size is used as a tensor dimension.
    pad_size = (self.window_receptor - 1) // 2
    src = dy.concatenate([
        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
        src,
        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
    ], d=1)
    padded_sent_len = sent_len + 2 * pad_size

    conv1 = dy.parameter(self.pConv1)
    bias1 = dy.parameter(self.pBias1)
    src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1),
                         batch_size=batch_size)
    cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])
    hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1),
                              batch_size=batch_size)
    hidden_layer = activate(hidden_layer)

    for conv_hid, bias_hid in self.builder_layers:
        hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid),
                                      dy.parameter(bias_hid), stride=[1, 1])
        hidden_layer = dy.reshape(hidden_layer,
                                  (self.internal_dim, sent_len, 1),
                                  batch_size=batch_size)
        hidden_layer = activate(hidden_layer)

    last_conv = dy.parameter(self.last_conv)
    last_bias = dy.parameter(self.last_bias)
    output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
    output = dy.reshape(output, (sent_len, self.output_dim),
                        batch_size=batch_size)
    output_seq = ExpressionSequence(expr_tensor=output)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def transduce(self, seq: ExpressionSequence) -> ExpressionSequence:
    """Add the child transducer's output to its input, optionally layer-normed.

    Args:
        seq: input sequence; it is fed to ``self.child`` and also added to
            the child's output.

    Returns:
        An ExpressionSequence wrapping the summed (and, when
        ``self.layer_norm`` is set, normalised) tensor; no mask is set.
    """
    child_out = self.child.transduce(seq)
    seq_tensor = child_out.as_tensor() + seq.as_tensor()

    if not self.layer_norm:
        return ExpressionSequence(expr_tensor=seq_tensor)

    d = seq_tensor.dim()
    orig_shape, orig_batch = d[0], d[1]
    # Fold dimension 1 into the batch so the norm is applied per column,
    # then restore the original layout.
    flat = dy.reshape(seq_tensor, (orig_shape[0],),
                      batch_size=orig_shape[1] * orig_batch)
    normed = dy.layer_norm(flat, self.ln_g, self.ln_b)
    seq_tensor = dy.reshape(normed, orig_shape, batch_size=orig_batch)
    return ExpressionSequence(expr_tensor=seq_tensor)
def transduce(self, expr_seq: ExpressionSequence) -> ExpressionSequence:
    """Apply multi-head scaled dot-product self-attention to ``expr_seq``.

    Args:
        expr_seq: input sequence; its transposed tensor and mask are used.

    Returns:
        A new ExpressionSequence built from the attended, re-projected
        tensor, carrying the input mask.
    """
    weights = (self.pWq, self.pWk, self.pWv, self.pWo)
    biases = (self.pbq, self.pbk, self.pbv, self.pbo)
    Wq, Wk, Wv, Wo = [dy.parameter(p) for p in weights]
    bq, bk, bv, bo = [dy.parameter(p) for p in biases]

    # Start with a [(length, model_size) x batch] tensor
    x = expr_seq.as_transposed_tensor()
    x_dim = x.dim()
    x_len = x_dim[0][0]
    x_batch = x_dim[1]

    # Query, key and value projections.
    # TODO: do we need bias broadcasting in DyNet?
    q = bq + x * Wq
    k = bk + x * Wk
    v = bv + x * Wv

    # Split to batches [(length, head_dim) x batch * num_heads] tensor
    head_shape = (x_len, self.head_dim)
    head_batch = x_batch * self.num_heads
    q = dy.reshape(q, head_shape, batch_size=head_batch)
    k = dy.reshape(k, head_shape, batch_size=head_batch)
    v = dy.reshape(v, head_shape, batch_size=head_batch)

    # Scaled dot product [(length, length) x batch * num_heads];
    # rows are queries, columns are keys.
    attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
    if expr_seq.mask is not None:
        # Repeat the mask once per head and add it as a large negative bias.
        repeated = np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0)
        mask = dy.inputTensor(repeated.transpose(), batched=True) * -1e10
        attn_score = attn_score + mask
    attn_prob = dy.softmax(attn_score, d=1)

    # Reduce using attention and resize to [(length, model_size) x batch]
    o = dy.reshape(attn_prob * v, (x_len, self.input_dim), batch_size=x_batch)
    # Final transformation
    o = bo + o * Wo

    expr_seq = ExpressionSequence(expr_transposed_tensor=o, mask=expr_seq.mask)
    self._final_states = [FinalTransducerState(expr_seq[-1], None)]
    return expr_seq
def exprseq_pooling(self, exprseq):
    """Reduce an expression sequence to a single vector by max-pooling.

    When the sequence has a mask, ``mask.add_to_tensor_expr(..., -1e10)`` is
    applied to its tensor first. A sequence without a mask is pooled as is
    (the unguarded mask access used to fail with AttributeError).

    Args:
        exprseq: the sequence to pool.

    Returns:
        ``dy.max_dim(..., d=1)`` of the tensor when it has more than one
        dimension, the tensor itself when it has one, or the element-wise
        maximum of ``expr_list`` when no tensor is available.
    """
    if exprseq.mask is not None:
        masked = exprseq.mask.add_to_tensor_expr(exprseq.as_tensor(), -1e10)
        exprseq = ExpressionSequence(expr_tensor=masked, mask=exprseq.mask)

    # Identity test: `!= None` would go through the expression's operators.
    if exprseq.expr_tensor is not None:
        if len(exprseq.expr_tensor.dim()[0]) > 1:
            return dy.max_dim(exprseq.expr_tensor, d=1)
        return exprseq.expr_tensor
    return dy.emax(exprseq.expr_list)
def transduce(self, x: ExpressionSequence) -> ExpressionSequence:
    """Pool ``x`` with softmax attention weights computed from ``self.W``.

    Args:
        x: input sequence; both its transposed and regular tensors are used.

    Returns:
        An ExpressionSequence wrapping ``x.as_tensor() * attention``, with no
        mask.
    """
    transposed = x.as_transposed_tensor()
    weight = dy.parameter(self.W)
    scores = transposed * weight

    input_mask = x.mask
    if input_mask is not None:
        scores = input_mask.add_to_tensor_expr(
            scores, multiplicator=-100.0, time_first=True)

    if self.pos_enc_max:
        # Only the first seq_len rows of the stored encoding are used.
        seq_len = transposed.dim()[0][0]
        pos_rows = self.pos_enc[:seq_len, :]
        scores = dy.cmult(scores, dy.inputTensor(pos_rows))

    attention = dy.softmax(scores)
    pooled = x.as_tensor() * attention
    return expression_sequence.ExpressionSequence(expr_tensor=pooled, mask=None)
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    """Combine ``src`` with the first ``len(src)`` positional embeddings.

    Args:
        src: input sequence.

    Returns:
        An ExpressionSequence holding embeddings + input (``op == 'sum'``) or
        their concatenation (``op == 'concat'``), carrying the input mask.

    Raises:
        ValueError: if ``self.op`` is neither ``'sum'`` nor ``'concat'``.
    """
    # Validate first so a bad configuration fails before any graph nodes are
    # built. The message uses self.op: a bare `op` is undefined here and
    # turned the intended ValueError into a NameError.
    if self.op not in ('sum', 'concat'):
        raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')

    sent_len = len(src)
    embeddings = dy.strided_select(dy.parameter(self.embedder),
                                   [1, 1], [0, 0],
                                   [self.input_dim, sent_len])
    if self.op == 'sum':
        output = embeddings + src.as_tensor()
    else:  # 'concat'
        output = dy.concatenate([embeddings, src.as_tensor()])

    output_seq = ExpressionSequence(expr_tensor=output, mask=src.mask)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    """Apply an affine map to ``src`` and scale the result to unit L2 norm.

    Args:
        src: input sequence; only its tensor form is used.

    Returns:
        An ExpressionSequence wrapping ``(W * src + b)`` divided by the
        square root of its squared norm (no mask is set).
    """
    tensor = src.as_tensor()
    height = tensor.dim()[0][0]
    batch_size = tensor.dim()[1]

    weight = dy.parameter(self.pW)
    bias = dy.parameter(self.pb)

    # The input is viewed as a single column of `height` rows.
    column = dy.reshape(tensor, (height, 1), batch_size=batch_size)

    projected = (weight * column) + bias
    norm = dy.sqrt(dy.squared_norm(projected))
    normalised = dy.cdiv(projected, norm)
    return ExpressionSequence(expr_tensor=normalised)
def embed_sent(self, sent):
    """Embed every position of a sentence or of a batch of sentences.

    Args:
        sent: an iterable of words, or a batch (as recognised by
            ``xnmt.batcher.is_batched``) of equally long sentences.

    Returns:
        An ExpressionSequence with one embedding per position. For batched
        input each embedding covers the whole batch and the batch's mask is
        attached; otherwise the mask is None.
    """
    batched = xnmt.batcher.is_batched(sent)

    if batched:
        # minibatch mode: all sentences must share one length
        seq_len = len(sent[0])
        for single_sent in sent:
            assert len(single_sent) == seq_len
        embeddings = []
        for word_i in range(seq_len):
            words_at_pos = [single_sent[word_i] for single_sent in sent]
            embeddings.append(self.embed(xnmt.batcher.mark_as_batch(words_at_pos)))
        mask = sent.mask
    else:
        # single mode
        embeddings = [self.embed(word) for word in sent]
        mask = None

    return ExpressionSequence(expr_list=embeddings, mask=mask)
def __call__(self, expr_seq):
    """Transduce the sequence with the stacked LSTM, honouring masks.

    At masked timesteps the previous h / c are copied forward.

    Args:
        expr_seq: expression sequence or list of expression sequences (where
            each inner list will be concatenated)

    Returns:
        expression sequence of the last layer's hidden states, carrying the
        mask of the first input sequence
    """
    if isinstance(expr_seq, ExpressionSequence):
        expr_seq = [expr_seq]
    batch_size = expr_seq[0][0].dim()[1]
    seq_len = len(expr_seq[0])
    mask = expr_seq[0].mask

    use_dropout = self.dropout_rate > 0.0 and self.train
    weightnoise_std = self.weightnoise_std if self.train else 0.0
    if use_dropout:
        self.set_dropout_masks(batch_size=batch_size)

    cur_input = expr_seq
    self._final_states = []
    for layer_i in range(self.num_layers):
        h = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
        c = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
        for pos_i in range(seq_len):
            # One input expression per incoming sequence; always a list.
            x_t = [inp[pos_i] for inp in cur_input]
            if use_dropout:
                # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                gates_t = dy.vanilla_lstm_gates_dropout_concat(
                    x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i],
                    self.b[layer_i], self.dropout_mask_x[layer_i],
                    self.dropout_mask_h[layer_i], weightnoise_std)
            else:
                gates_t = dy.vanilla_lstm_gates_concat(
                    x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i],
                    self.b[layer_i], weightnoise_std)
            c_t = dy.vanilla_lstm_c(c[-1], gates_t)
            h_t = dy.vanilla_lstm_h(c_t, gates_t)

            # Skip the blending when nothing is masked at this timestep.
            if mask is None or np.isclose(
                    np.sum(mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
                c.append(c_t)
                h.append(h_t)
            else:
                # Blend the new state with the previous one per batch element.
                c.append(mask.cmult_by_timestep_expr(c_t, pos_i, True)
                         + mask.cmult_by_timestep_expr(c[-1], pos_i, False))
                h.append(mask.cmult_by_timestep_expr(h_t, pos_i, True)
                         + mask.cmult_by_timestep_expr(h[-1], pos_i, False))
        self._final_states.append(FinalTransducerState(h[-1], c[-1]))
        # The next layer reads this layer's hidden states (initial zero dropped).
        cur_input = [h[1:]]

    return ExpressionSequence(expr_list=h[1:], mask=mask)
def transduce(self, embeds):
    """Return the last final state after transducing the columns of ``embeds``.

    Each column of ``embeds`` is selected, max-reduced over dimension 1 and
    fed as one timestep to ``self.seq_transducer``. The transducer's output
    sequence is discarded; only its last final state's main expression is
    returned.
    """
    seq_len = embeds.dim()[0][1]
    columns = [dy.max_dim(dy.select_cols(embeds, [col]), 1)
               for col in range(seq_len)]
    # Called for its effect on the transducer's final states.
    self.seq_transducer.transduce(ExpressionSequence(columns))
    last_state = self.seq_transducer.get_final_states()[-1]
    return last_state.main_expr()
def __call__(self, es):
    """Run the stacked forward/backward builders with downsampling in between.

    After every layer except the last, the outputs are reduced by that
    layer's reduce factor, either by keeping every n-th output ("skip") or by
    splitting them into n interleaved sequences ("concat"). The last layer's
    forward and (re-reversed) backward outputs are concatenated per position.

    :param es: an ExpressionSequence
    :returns: the ExpressionSequence produced by the last layer
    """
    es_list = [es]
    last_layer_i = len(self.builder_layers) - 1

    for layer_i, (fb, bb) in enumerate(self.builder_layers):
        reduce_factor = self._reduce_factor_for_layer(layer_i)

        if self.downsampling_method=="concat" and len(es_list[0]) % reduce_factor != 0:
            raise ValueError("For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor. Configure batcher accordingly.")

        fs = fb(es_list)
        bs = bb([ReversedExpressionSequence(item) for item in es_list])

        if not layer_i < last_layer_i:
            # concat final outputs
            pairs = zip(fs, ReversedExpressionSequence(bs))
            ret_es = ExpressionSequence(
                expr_list=[dy.concatenate([f, b]) for f, b in pairs])
        elif self.downsampling_method=="skip":
            es_list = [ExpressionSequence(expr_list=fs[::reduce_factor]),
                       ExpressionSequence(expr_list=bs[::reduce_factor][::-1])]
        elif self.downsampling_method=="concat":
            es_len = len(es_list[0])
            es_list_fwd = []
            es_list_bwd = []
            for i in range(0, es_len, reduce_factor):
                if i == 0:
                    # One bucket per offset within a group.
                    es_list_fwd = [[] for _ in range(reduce_factor)]
                    es_list_bwd = [[] for _ in range(reduce_factor)]
                for j in range(reduce_factor):
                    es_list_fwd[j].append(fs[i + j])
                    es_list_bwd[j].append(bs[es_len - reduce_factor + j - i])
            fwd_seqs = [ExpressionSequence(expr_list=es_list_fwd[j])
                        for j in range(reduce_factor)]
            bwd_seqs = [ExpressionSequence(expr_list=es_list_bwd[j])
                        for j in range(reduce_factor)]
            es_list = fwd_seqs + bwd_seqs
        else:
            raise RuntimeError("unknown downsampling_method %s" % self.downsampling_method)

    # One final state per layer: forward and backward states concatenated.
    final_states = []
    for fb, bb in self.builder_layers:
        fwd_state = fb.get_final_states()[0]
        bwd_state = bb.get_final_states()[0]
        main = dy.concatenate([fwd_state.main_expr(), bwd_state.main_expr()])
        cell = dy.concatenate([fwd_state.cell_expr(), bwd_state.cell_expr()])
        final_states.append(FinalTransducerState(main, cell))
    self._final_states = final_states
    return ret_es
def __call__(self, src):
    """Encode ``src`` with a conv layer, a recurrent highway network and attention.

    The input tensor is padded and convolved, each timestep is run through
    ``self.rhn_num_hidden_layers`` recurrent highway layers of
    ``self.rhn_microsteps`` micro-steps each, and the resulting states are
    averaged with softmax attention weights.

    Returns:
        An ExpressionSequence wrapping the attention-weighted state.
    """
    src = src.as_tensor()

    # convolutional layer
    src = padding(src, src.dim()[0][0], src.dim()[0][1],
                  self.filter_width, self.stride, src.dim()[1])
    conv_out = dy.conv2d(src, dy.parameter(self.filter_conv),
                         stride=[self.stride, self.stride], is_valid=True)
    l1 = dy.rectify(conv_out)

    l1_shape, batch_size = l1.dim()[0], l1.dim()[1]
    timestep = l1_shape[1]
    features = l1_shape[2]

    # View l1 as (timestep, features) per batch element, one entry per step.
    rhn_in = dy.reshape(l1, (timestep, features), batch_size=batch_size)
    rhn_in = [dy.pick(rhn_in, t) for t in range(timestep)]

    for l in range(self.rhn_num_hidden_layers):
        rhn_out = []
        # Each layer starts from its own learned initial state.
        prev_state = dy.parameter(self.init[l])
        # begin recurrent highway network
        for t in range(timestep):
            for m in range(0, self.rhn_microsteps):
                recur = self.recur[l][m]
                H = dy.affine_transform(
                    [dy.parameter(recur[1]), dy.parameter(recur[0]), prev_state])
                T = dy.affine_transform(
                    [dy.parameter(recur[3]), dy.parameter(recur[2]), prev_state])
                if m == 0:
                    # Only the first micro-step sees the layer input.
                    H += dy.parameter(self.linear[l][0]) * rhn_in[t]
                    T += dy.parameter(self.linear[l][1]) * rhn_in[t]
                H = dy.tanh(H)
                T = dy.logistic(T)
                # Highway update: T gates between the old state and H.
                prev_state = dy.cmult(1 - T, prev_state) + dy.cmult(T, H)
            rhn_out.append(prev_state)
        if self.residual and l > 0:
            rhn_out = [sum(pair) for pair in zip(rhn_out, rhn_in)]
        rhn_in = rhn_out

    # Compute the attention-weighted average of the activations
    rhn_in = dy.concatenate_cols(rhn_in)
    att_out_proj = dy.transpose(dy.parameter(self.attention[0][1]))
    att_hidden = dy.tanh(dy.parameter(self.attention[0][0]) * rhn_in)
    scores = att_out_proj * att_hidden
    scores = dy.reshape(scores, (scores.dim()[0][1],),
                        batch_size=scores.dim()[1])
    attn_out = rhn_in * dy.softmax(scores)
    return ExpressionSequence(expr_tensor=attn_out)
def transduce(self, es):
    """Run a bidirectional layer over ``es``, then the residual network.

    The forward and (re-reversed) backward outputs are concatenated per
    position and passed to ``self.residual_network``.
    ``self._final_states`` holds the concatenated bidirectional state
    followed by the residual network's final states.
    """
    forward_e = self.forward_layer(es)
    backward_e = self.backward_layer(ReversedExpressionSequence(es))

    fwd_state = self.forward_layer.get_final_states()[0]
    bwd_state = self.backward_layer.get_final_states()[0]
    main = dy.concatenate([fwd_state.main_expr(), bwd_state.main_expr()])
    cell = dy.concatenate([fwd_state.cell_expr(), bwd_state.cell_expr()])
    self._final_states = [FinalTransducerState(main, cell)]

    joined = []
    for f, b in zip(forward_e, ReversedExpressionSequence(backward_e)):
        joined.append(dy.concatenate([f, b]))
    output = self.residual_network.transduce(ExpressionSequence(expr_list=joined))

    self._final_states += self.residual_network.get_final_states()
    return output
def transduce(self, embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
    """Segment ``embed_sent``, compose the segments and encode each sample.

    Segmentation decisions are sampled, every segment is cut out of the
    embedded sequence and handed to ``self.segment_composer.compose``, and
    each sampled sentence is padded and run through ``self.final_transducer``.

    Side effects: appends to ``self.final_states`` and, even when encoding
    fails, stores ``compose_output``, ``segment_actions`` and (with
    ``self.length_prior``) ``seg_size_unpadded``; may add a report entry.

    Args:
        embed_sent: the embedded input sequence.

    Returns:
        A CompoundSeqExpression with one encoded sequence per sample.
    """
    batch_size = embed_sent[0].dim()[1]
    actions = self.sample_segmentation(embed_sent, batch_size)
    sample_size = len(actions)
    embeddings = dy.concatenate(embed_sent.expr_list, d=1)
    # NOTE(review): presumably forces evaluation before slicing - confirm.
    embeddings.value()

    composed_words = []
    for i in range(batch_size):
        sequence = dy.pick_batch_elem(embeddings, i)
        # For each sampled segmentation
        for j, sample in enumerate(actions):
            lower_bound = 0
            # Read every 'segment' decision
            for k, upper_bound in enumerate(sample[i]):
                # Build the slice once; a second identical pick_range node
                # was previously created and discarded.
                char_sequence = dy.pick_range(sequence, lower_bound,
                                              upper_bound + 1, 1)
                composed_words.append((char_sequence, j, i, k,
                                       lower_bound, upper_bound + 1))
                lower_bound = upper_bound + 1

    outputs = self.segment_composer.compose(composed_words, sample_size,
                                            batch_size)

    # Padding + return
    # Pre-bound so the `finally` block cannot raise NameError and hide the
    # original exception if computing the sizes fails.
    seg_size_unpadded = None
    try:
        if self.length_prior:
            seg_size_unpadded = [[len(outputs[i][j])
                                  for j in range(batch_size)]
                                 for i in range(sample_size)]
        enc_outputs = []
        for batched_sampled_sentence in outputs:
            sampled_sentence, segment_mask = self.pad(batched_sampled_sentence)
            expr_seq = ExpressionSequence(
                expr_tensor=dy.concatenate_to_batch(sampled_sentence),
                mask=segment_mask)
            sent_context = self.final_transducer.transduce(expr_seq)
            self.final_states.append(self.final_transducer.get_final_states())
            enc_outputs.append(sent_context)
        return CompoundSeqExpression(enc_outputs)
    finally:
        if self.length_prior:
            self.seg_size_unpadded = seg_size_unpadded
        self.compose_output = outputs
        self.segment_actions = actions
        if not self.train and self.compute_report:
            self.add_sent_for_report({"segment_actions": actions})
def transduce(self, x: ExpressionSequence) -> ExpressionSequence:
    """Apply self-attention and the feed-forward sublayer to ``x``.

    Side effect: stores the feed-forward output in ``self._recent_output``.

    Args:
        x: input sequence.

    Returns:
        An ExpressionSequence of the (possibly downsampled) output, carrying
        the (possibly subsampled) input mask.
    """
    seq_len = len(x)
    batch_size = x[0].dim()[1]

    att_mask = None
    if self.diagonal_mask_width is not None:
        # 1.0 marks masked positions; a window around each index is cleared.
        # (An inner `is None` check was removed: it could never be true here.)
        half_width = self.diagonal_mask_width // 2
        att_mask = np.ones((seq_len, seq_len))
        for i in range(seq_len):
            from_i = max(0, i - half_width)
            to_i = min(seq_len, i + half_width + 1)
            att_mask[from_i:to_i, from_i:to_i] = 0.0

    mid = self.self_attn(x=x, att_mask=att_mask,
                         batch_mask=x.mask.np_arr if x.mask else None,
                         p=self.dropout)

    if self.downsample_factor > 1:
        seq_len = int(math.ceil(seq_len / float(self.downsample_factor)))
    hidden_dim = mid.dim()[0][0]

    out_mask = x.mask
    if self.downsample_factor > 1 and out_mask is not None:
        out_mask = out_mask.lin_subsampled(
            reduce_factor=self.downsample_factor)

    if self.ff_lstm:
        # The LSTM works on a (hidden, time) sequence; afterwards time is
        # folded back into the batch dimension.
        mid_re = dy.reshape(mid, (hidden_dim, seq_len), batch_size=batch_size)
        out = self.feed_forward.transduce(
            ExpressionSequence(expr_tensor=mid_re, mask=out_mask))
        out = dy.reshape(out.as_tensor(), (hidden_dim,),
                         batch_size=seq_len * batch_size)
    else:
        out = self.feed_forward.transduce(mid, p=self.dropout)

    self._recent_output = out
    return ExpressionSequence(
        expr_tensor=dy.reshape(out, (out.dim()[0][0], seq_len),
                               batch_size=batch_size),
        mask=out_mask)
def embed_sent(self, sent):
    """Embed each position of ``sent`` and wrap the result as a sequence.

    Unbatched input is embedded word by word with no mask. Batched input
    (per ``xnmt.batcher.is_batched``) must contain equally long sentences;
    each position is embedded as one batch and the batch's mask is attached.
    """
    is_batch = xnmt.batcher.is_batched(sent)

    if not is_batch:
        # single mode
        return ExpressionSequence(
            expr_list=[self.embed(word) for word in sent], mask=None)

    # minibatch mode
    seq_len = len(sent[0])
    for single_sent in sent:
        assert len(single_sent)==seq_len

    embeddings = []
    for word_i in range(seq_len):
        batch = xnmt.batcher.mark_as_batch(
            [single_sent[word_i] for single_sent in sent])
        embeddings.append(self.embed(batch))
    return ExpressionSequence(expr_list=embeddings, mask=sent.mask)
def embed_sent(self, sent):
    """Embed a sentence or batch, lazily when the input offers ``get_array``.

    Inputs whose (first) sentence has a ``get_array`` attribute are wrapped
    in a LazyNumpyExpressionSequence without embedding. Everything else is
    embedded position by position.

    Args:
        sent: a sentence, or a batch as recognised by
            ``xnmt.batcher.is_batched``.

    Returns:
        A LazyNumpyExpressionSequence or an ExpressionSequence. The mask is
        taken from the batch; for unbatched input it is ``sent.mask`` when
        that attribute exists and None otherwise.
    """
    # TODO refactor: seems a bit too many special cases that need to be distinguished
    batched = xnmt.batcher.is_batched(sent)
    first_sent = sent[0] if batched else sent

    if hasattr(first_sent, "get_array"):
        if not batched:
            return LazyNumpyExpressionSequence(lazy_data=sent.get_array())
        return LazyNumpyExpressionSequence(
            lazy_data=xnmt.batcher.mark_as_batch(list(sent)),
            mask=sent.mask)

    if batched:
        embeddings = [
            self.embed(xnmt.batcher.mark_as_batch(
                [single_sent[word_i] for single_sent in sent]))
            for word_i in range(sent.sent_len())
        ]
        mask = sent.mask
    else:
        embeddings = [self.embed(word) for word in sent]
        # A plain unbatched sentence may have no mask attribute; reading
        # `sent.mask` unconditionally raised AttributeError in that case.
        mask = getattr(sent, "mask", None)
    return ExpressionSequence(expr_list=embeddings, mask=mask)
def transduce(self, embed_sent):
    """Apply an affine transform and the configured non-linearity.

    Args:
        embed_sent: input sequence; only its tensor form is used.

    Returns:
        An ExpressionSequence wrapping the activated output (no mask is set).
        Unrecognised ``self.nonlinearity`` values behave like ``'linear'``.
    """
    src = embed_sent.as_tensor()
    W = dy.parameter(self.pW)
    b = dy.parameter(self.pb)
    l1 = dy.affine_transform([b, W, src])

    # Strings are compared by value; identity (`is`) only matches interned
    # literals and silently fell through to linear for other equal strings.
    if self.nonlinearity == 'sigmoid':
        output = dy.logistic(l1)
    elif self.nonlinearity == 'tanh':
        # NOTE(review): 2*sigmoid(x) - 1 equals tanh(x / 2), not tanh(x).
        # Left unchanged so existing models keep their numerics - confirm.
        output = 2 * dy.logistic(l1) - 1
    elif self.nonlinearity == 'relu':
        output = dy.rectify(l1)
    else:  # 'linear' and anything unrecognised
        output = l1

    output_seq = ExpressionSequence(expr_tensor=output)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def __call__(self, es):
    """Run the stacked forward/backward builders, zero-padding for 'concat'.

    Before each layer, when 'concat' downsampling is used and the sequence
    length is not a multiple of the layer's reduce factor, zero vectors are
    appended to every input sequence until it is. After every layer except
    the last, outputs are reduced by keeping every n-th one ("skip") or by
    splitting them into n interleaved sequences ("concat"). The last layer's
    forward and (re-reversed) backward outputs are concatenated per position.

    :param es: an ExpressionSequence
    :returns: the ExpressionSequence produced by the last layer
    """
    es_list = [es]
    zero_pad = None
    batch_size = es_list[0][0].dim()[1]
    last_layer_i = len(self.builder_layers) - 1

    for layer_i, (fb, bb) in enumerate(self.builder_layers):
        reduce_factor = self._reduce_factor_for_layer(layer_i)

        # Pad one step at a time until the length divides evenly.
        while (self.downsampling_method == "concat"
               and len(es_list[0]) % reduce_factor != 0):
            for es_i in range(len(es_list)):
                expr_list = es_list[es_i].as_list()
                vec_dim = expr_list[0].dim()[0][0]
                # Reuse the zero vector unless the vector size changed.
                if zero_pad is None or zero_pad.dim()[0][0] != vec_dim:
                    zero_pad = dy.zeros(dim=expr_list[0].dim()[0][0],
                                        batch_size=batch_size)
                expr_list.append(zero_pad)
                es_list[es_i] = ExpressionSequence(expr_list=expr_list)

        fs = fb(es_list)
        bs = bb([ReversedExpressionSequence(item) for item in es_list])

        if not layer_i < last_layer_i:
            # concat final outputs
            pairs = zip(fs, ReversedExpressionSequence(bs))
            ret_es = ExpressionSequence(
                expr_list=[dy.concatenate([f, b]) for f, b in pairs])
        elif self.downsampling_method == "skip":
            es_list = [
                ExpressionSequence(expr_list=fs[::reduce_factor]),
                ExpressionSequence(expr_list=bs[::reduce_factor][::-1]),
            ]
        elif self.downsampling_method == "concat":
            es_len = len(es_list[0])
            es_list_fwd = []
            es_list_bwd = []
            for i in range(0, es_len, reduce_factor):
                if i == 0:
                    # One bucket per offset within a group.
                    es_list_fwd = [[] for _ in range(reduce_factor)]
                    es_list_bwd = [[] for _ in range(reduce_factor)]
                for j in range(reduce_factor):
                    es_list_fwd[j].append(fs[i + j])
                    es_list_bwd[j].append(bs[es_len - reduce_factor + j - i])
            fwd_seqs = [ExpressionSequence(expr_list=es_list_fwd[j])
                        for j in range(reduce_factor)]
            bwd_seqs = [ExpressionSequence(expr_list=es_list_bwd[j])
                        for j in range(reduce_factor)]
            es_list = fwd_seqs + bwd_seqs
        else:
            raise RuntimeError("unknown downsampling_method %s" %
                               self.downsampling_method)

    # One final state per layer: forward and backward states concatenated.
    final_states = []
    for fb, bb in self.builder_layers:
        fwd_state = fb.get_final_states()[0]
        bwd_state = bb.get_final_states()[0]
        main = dy.concatenate([fwd_state.main_expr(), bwd_state.main_expr()])
        cell = dy.concatenate([fwd_state.cell_expr(), bwd_state.cell_expr()])
        final_states.append(FinalTransducerState(main, cell))
    self._final_states = final_states
    return ret_es
def __call__(self, es, transitions):
    """Compose ``es`` bottom-up following shift/reduce ``transitions``.

    Every transition list gets one extra shift (0) and reduce (1) appended.
    For each batch element, 0 pushes the next input onto the stacks, 1 pops
    two entries and pushes their gated composition, and any other value
    records a zero vector.

    Side effects: sets ``self.hfinals``, ``self.cfinals`` and
    ``self._final_states``.

    Args:
        es: input sequence; ``es[count]`` is reshaped to
            ``(batch_size, self.hidden_dim)`` and indexed per batch element.
        transitions: one list of transition codes per batch element.

    Returns:
        An ExpressionSequence with one entry per recorded composition step.
    """
    transitions = [t + [0, 1] for t in transitions]
    transitions = np.array(transitions)

    Wl = dy.parameter(self.p_Wl)
    Wr = dy.parameter(self.p_Wr)
    b = dy.parameter(self.p_b)
    batch_size = len(transitions)
    hidden_dim = self.hidden_dim

    ha = []
    self.hfinals = []
    self.cfinals = []
    for i in range(batch_size):
        hstack = []
        cstack = []
        htmp = []
        count = 0
        for j in range(len(transitions[i])):
            if transitions[i][j] == 0:
                # shift: the same expression seeds both stacks
                e1 = dy.reshape(es[count], (batch_size, hidden_dim))[i]
                count += 1
                hstack.append(e1)
                cstack.append(e1)
            elif transitions[i][j] == 1:
                # reduce: combine the two topmost entries
                h1 = hstack.pop()
                h2 = hstack.pop()
                c1 = cstack.pop()
                c2 = cstack.pop()
                tmp = dy.affine_transform([b, Wl, h1, Wr, h2])
                i_gate = dy.pick_range(tmp, 0, hidden_dim)
                fl_gate = dy.pick_range(tmp, hidden_dim, hidden_dim * 2)
                fr_gate = dy.pick_range(tmp, hidden_dim * 2, hidden_dim * 3)
                o_gate = dy.pick_range(tmp, hidden_dim * 3, hidden_dim * 4)
                cell_inp = dy.pick_range(tmp, hidden_dim * 4, hidden_dim * 5)
                # As in the original: tanh on i_gate, logistic on the rest.
                i_gate = dy.tanh(i_gate)
                cell_inp = dy.logistic(cell_inp)
                fl_gate = dy.logistic(fl_gate)
                fr_gate = dy.logistic(fr_gate)
                o_gate = dy.logistic(o_gate)
                c_t = (dy.cmult(fl_gate, c1) + dy.cmult(fr_gate, c2)
                       + dy.cmult(i_gate, cell_inp))
                h_t = dy.cmult(o_gate, dy.tanh(c_t))
                cstack.append(c_t)
                hstack.append(h_t)
                htmp.append(h_t)
            else:
                htmp.append(dy.zeros(hidden_dim))
        # h_t / c_t hold the result of the last reduce for this element.
        self.hfinals.append(h_t)
        self.cfinals.append(c_t)
        ha.append(htmp)

    self._final_states = [
        FinalTransducerState(dy.concatenate_to_batch(self.hfinals),
                             dy.concatenate_to_batch(self.cfinals))
    ]

    # Regroup per step across the batch; zip_longest fills short rows with None.
    k = []
    for step in zip_longest(*ha):
        xx = list(step)
        k.append(dy.reshape(dy.concatenate(xx),
                            (xx[0].dim()[0][0], len(xx))))
    return ExpressionSequence(expr_list=k)
def __call__(self, x: dy.Expression, att_mask: np.ndarray, batch_mask: np.ndarray, p: float):
    """Apply multi-headed self-attention with a residual connection.

    Args:
        x: expression of dimensions (input_dim, time) x batch. The body also
            reads ``x.mask``, ``x.as_tensor()`` and ``x[0]``.
        att_mask: numpy array of dimensions (time, time); pre-transposed
        batch_mask: numpy array of dimensions (batch, time)
        p: dropout prob

    Returns:
        ``self.layer_norm`` applied to the attention output plus residual.

    Raises:
        ValueError: if downsampling is enabled and the sequence length is not
            a multiple of ``self.downsample_factor``.
    """
    sent_len = x.dim()[0][1]
    batch_size = x[0].dim()[1]

    if self.downsample_factor > 1:
        if sent_len % self.downsample_factor != 0:
            raise ValueError(
                "For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                "Configure batcher accordingly.")
        if batch_mask is not None:
            batch_mask = batch_mask[:, ::self.downsample_factor]
        sent_len_out = sent_len // self.downsample_factor
        sent_len = sent_len_out
        out_mask = x.mask
        if out_mask is not None:
            out_mask = out_mask.lin_subsampled(
                reduce_factor=self.downsample_factor)

        # Stack `downsample_factor` neighbouring steps into one longer
        # vector. Integer division: `/` would yield a float, which is not a
        # valid tensor dimension (divisibility was checked above).
        x = ExpressionSequence(
            expr_tensor=dy.reshape(
                x.as_tensor(),
                (x.dim()[0][0] * self.downsample_factor,
                 x.dim()[0][1] // self.downsample_factor),
                batch_size=batch_size),
            mask=out_mask)
        residual = SAAMTimeDistributed()(x)
    else:
        residual = SAAMTimeDistributed()(x)
        sent_len_out = sent_len

    if self.model_dim != self.input_dim * self.downsample_factor:
        residual = self.res_shortcut(residual)

    heads_dim = self.head_count * self.dim_per_head

    # Concatenate all the words together for doing vectorized affine transform
    if self.kq_pos_encoding_type is None:
        kvq_lin = self.linear_kvq(SAAMTimeDistributed()(x))
        key_up = self.shape_projection(
            dy.pick_range(kvq_lin, 0, heads_dim), batch_size)
        value_up = self.shape_projection(
            dy.pick_range(kvq_lin, heads_dim, 2 * heads_dim), batch_size)
        query_up = self.shape_projection(
            dy.pick_range(kvq_lin, 2 * heads_dim, 3 * heads_dim), batch_size)
    else:
        assert self.kq_pos_encoding_type == "embedding"
        encoding = self.kq_positional_embedder.embed_sent(sent_len).as_tensor()
        # Keys and queries see the positional encoding; values do not.
        kq_lin = self.linear_kq(SAAMTimeDistributed()(ExpressionSequence(
            expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
        key_up = self.shape_projection(
            dy.pick_range(kq_lin, 0, heads_dim), batch_size)
        query_up = self.shape_projection(
            dy.pick_range(kq_lin, heads_dim, 2 * heads_dim), batch_size)
        v_lin = self.linear_v(SAAMTimeDistributed()(x))
        value_up = self.shape_projection(v_lin, batch_size)

    if self.cross_pos_encoding_type:
        assert self.cross_pos_encoding_type == "embedding"
        emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0, sent_len)
        emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0, sent_len)
        key_up = dy.reshape(key_up,
                            (sent_len, self.dim_per_head, self.head_count),
                            batch_size=batch_size)
        key_up = dy.concatenate_cols(
            [dy.cmult(key_up, emb1), dy.cmult(key_up, emb2)])
        key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2),
                            batch_size=self.head_count * batch_size)
        query_up = dy.reshape(query_up,
                              (sent_len, self.dim_per_head, self.head_count),
                              batch_size=batch_size)
        query_up = dy.concatenate_cols(
            [dy.cmult(query_up, emb2), dy.cmult(query_up, -emb1)])
        query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2),
                              batch_size=self.head_count * batch_size)

    # scale before the matrix multiplication to save memory
    scaled = query_up * dy.transpose(key_up / math.sqrt(self.dim_per_head))

    # Apply Mask here
    if not self.ignore_masks:
        if att_mask is not None:
            att_mask_inp = att_mask * -100.0
            if self.downsample_factor > 1:
                att_mask_inp = att_mask_inp[::self.downsample_factor,
                                            ::self.downsample_factor]
            scaled += dy.inputTensor(att_mask_inp)
        if batch_mask is not None:
            # reshape (batch, time) -> (time, head_count*batch), then *-100
            inp = np.resize(
                np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                (sent_len, self.head_count, batch_size)),
                (1, sent_len, self.head_count * batch_size)) * -100
            mask_expr = dy.inputTensor(inp, batched=True)
            scaled += mask_expr
        # NOTE(review): nesting of this block under `ignore_masks` was
        # reconstructed from a flattened source - confirm.
        if self.diag_gauss_mask:
            diag_growing = np.zeros((sent_len, sent_len, self.head_count))
            for i in range(sent_len):
                for j in range(sent_len):
                    diag_growing[i, j, :] = -(i - j)**2 / 2.0
            e_diag_gauss_mask = dy.inputTensor(diag_growing)
            e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
            if self.square_mask_std:
                e_sigma = dy.square(e_sigma)
            e_sigma_sq_inv = dy.cdiv(
                dy.ones(e_sigma.dim()[0], batch_size=batch_size),
                dy.square(e_sigma))
            e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask,
                                               e_sigma_sq_inv)
            scaled += dy.reshape(e_diag_gauss_mask_final,
                                 (sent_len, sent_len),
                                 batch_size=batch_size * self.head_count)

    # Computing Softmax here.
    attn = dy.softmax(scaled, d=1)
    if LOG_ATTENTION:
        yaml_logger.info({
            "key": "selfatt_mat_ax0",
            "value": np.average(attn.value(), axis=0).dumps(),
            "desc": self.desc
        })
        yaml_logger.info({
            "key": "selfatt_mat_ax1",
            "value": np.average(attn.value(), axis=1).dumps(),
            "desc": self.desc
        })
        yaml_logger.info({
            "key": "selfatt_mat_ax0_ent",
            "value": entropy(attn.value()).dumps(),
            "desc": self.desc
        })
        yaml_logger.info({
            "key": "selfatt_mat_ax1_ent",
            "value": entropy(attn.value().transpose()).dumps(),
            "desc": self.desc
        })

    # NOTE(review): this overwrites any configured value, so the head
    # selection below always keeps only head 0. Its placement at function
    # level was reconstructed from a flattened source - confirm both.
    self.select_att_head = 0
    if self.select_att_head is not None:
        # Zero out every head except the selected one.
        attn = dy.reshape(attn, (sent_len, sent_len, self.head_count),
                          batch_size=batch_size)
        sel_mask = np.zeros((1, 1, self.head_count))
        sel_mask[0, 0, self.select_att_head] = 1.0
        attn = dy.cmult(attn, dy.inputTensor(sel_mask))
        attn = dy.reshape(attn, (sent_len, sent_len),
                          batch_size=self.head_count * batch_size)

    # Applying dropout to attention
    if p > 0.0:
        drop_attn = dy.dropout(attn, p)
    else:
        drop_attn = attn

    # Computing weighted attention score
    attn_prod = drop_attn * value_up

    # Reshaping the attn_prod to input query dimensions
    out = dy.reshape(attn_prod,
                     (sent_len_out, self.dim_per_head * self.head_count),
                     batch_size=batch_size)
    out = dy.transpose(out)
    out = dy.reshape(out, (self.model_dim,),
                     batch_size=batch_size * sent_len_out)

    if self.plot_attention:
        from sklearn.metrics.pairwise import cosine_similarity
        assert batch_size == 1
        mats = []
        for i in range(attn.dim()[1]):
            mats.append(dy.pick_batch_elem(attn, i).npvalue())
            self.plot_att_mat(
                mats[-1],
                "{}.sent_{}.head_{}.png".format(
                    self.plot_attention, self.plot_attention_counter, i),
                300)
        avg_mat = np.average(mats, axis=0)
        self.plot_att_mat(
            avg_mat,
            "{}.sent_{}.head_avg.png".format(self.plot_attention,
                                             self.plot_attention_counter),
            300)
        cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
        self.plot_att_mat(
            cosim_before,
            "{}.sent_{}.cosim_before.png".format(
                self.plot_attention, self.plot_attention_counter),
            600)
        cosim_after = cosine_similarity(out.npvalue().T)
        self.plot_att_mat(
            cosim_after,
            "{}.sent_{}.cosim_after.png".format(
                self.plot_attention, self.plot_attention_counter),
            600)
        self.plot_attention_counter += 1

    # Adding dropout and layer normalization
    if p > 0.0:
        res = dy.dropout(out, p) + residual
    else:
        res = out + residual
    ret = self.layer_norm(res)
    return ret
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    """
    Apply ``self.transform`` to the tensor form of ``src`` and wrap the result.

    Also stores the last element of the resulting sequence as the single
    entry of ``self._final_states``.

    Args:
      src: input sequence; only its ``as_tensor()`` view is used.
    Returns:
      An ExpressionSequence built from the transformed tensor.
    """
    transformed = self.transform(src.as_tensor())
    result = ExpressionSequence(expr_tensor=transformed)
    # The final state is the last element of the output sequence.
    last_step = result[-1]
    self._final_states = [FinalTransducerState(last_step)]
    return result
def __call__(self, output):
    """
    Return ``output`` as an ExpressionSequence.

    Args:
      output: either an ExpressionSequence, which is returned unchanged, or
              any other value, which is passed as ``expr_list`` to a new
              ExpressionSequence.
    Returns:
      An ExpressionSequence.
    """
    if isinstance(output, ExpressionSequence):
        # Already the right type: hand it back untouched.
        return output
    return ExpressionSequence(expr_list=output)
def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
    """
    Run ``es`` through every (forward, backward) builder pair in
    ``self.builder_layers``, downsampling between layers.

    Between layers the outputs are reduced by the layer's reduce factor
    according to ``self.downsampling_method``:

    * ``"skip"``: keep every ``reduce_factor``-th forward and backward output.
    * ``"concat"``: split the outputs into ``reduce_factor`` interleaved
      streams, which are all passed on as inputs to the next layer.

    On the last layer, each forward output is concatenated with the
    corresponding element of the reversed backward outputs. After all layers
    have run, ``self._final_states`` holds one FinalTransducerState per layer,
    built from the concatenated forward/backward main and cell expressions.

    Args:
      es: an ExpressionSequence
    Returns:
      The ExpressionSequence of concatenated outputs of the last layer.
    Raises:
      ValueError: for ``"concat"`` downsampling when the sequence length is
                  not a multiple of the layer's reduce factor.
      RuntimeError: when ``self.downsampling_method`` is neither ``"skip"``
                    nor ``"concat"`` and more than one layer is present.
    """

    def first_final_state(builder):
        # First entry of the builder's reported final states.
        return builder.get_final_states()[0]

    last_layer_i = len(self.builder_layers) - 1
    es_list = [es]

    for layer_i, (fb, bb) in enumerate(self.builder_layers):
        reduce_factor = self._reduce_factor_for_layer(layer_i)

        # The output mask is derived from the first input's mask, if any.
        mask_out = None
        if es_list[0].mask is not None:
            mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

        uses_concat = self.downsampling_method == "concat"
        if uses_concat and len(es_list[0]) % reduce_factor != 0:
            raise ValueError(
                f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                f"but got sequence length={len(es_list[0])} for reduce_factor={reduce_factor}. "
                f"Set Batcher's pad_src_to_multiple argument accordingly.")

        fs = fb.transduce(es_list)
        reversed_inputs = [ReversedExpressionSequence(es_item) for es_item in es_list]
        bs = bb.transduce(reversed_inputs)

        if layer_i >= last_layer_i:
            # Last layer: concat final outputs, no further downsampling.
            ret_es = ExpressionSequence(
                expr_list=[
                    dy.concatenate([f, b])
                    for f, b in zip(fs, ReversedExpressionSequence(bs))
                ],
                mask=mask_out)
            continue

        if self.downsampling_method == "skip":
            fwd_seq = ExpressionSequence(expr_list=fs[::reduce_factor],
                                         mask=mask_out)
            # The backward selection is flipped after subsampling
            # (presumably to restore forward time order -- TODO confirm).
            bwd_seq = ExpressionSequence(expr_list=bs[::reduce_factor][::-1],
                                         mask=mask_out)
            es_list = [fwd_seq, bwd_seq]
        elif self.downsampling_method == "concat":
            seq_len = len(es_list[0])
            fwd_streams = []
            bwd_streams = []
            for chunk_start in range(0, seq_len, reduce_factor):
                for offset in range(reduce_factor):
                    if chunk_start == 0:
                        # First chunk: create one stream per in-chunk offset.
                        fwd_streams.append([])
                        bwd_streams.append([])
                    fwd_streams[offset].append(fs[chunk_start + offset])
                    bwd_index = seq_len - reduce_factor + offset - chunk_start
                    bwd_streams[offset].append(bs[bwd_index])
            fwd_inputs = [
                ExpressionSequence(expr_list=fwd_streams[j], mask=mask_out)
                for j in range(reduce_factor)
            ]
            bwd_inputs = [
                ExpressionSequence(expr_list=bwd_streams[j], mask=mask_out)
                for j in range(reduce_factor)
            ]
            es_list = fwd_inputs + bwd_inputs
        else:
            raise RuntimeError(
                f"unknown downsampling_method {self.downsampling_method}")

    final_states = []
    for fwd_builder, bwd_builder in self.builder_layers:
        main = dy.concatenate([first_final_state(fwd_builder).main_expr(),
                               first_final_state(bwd_builder).main_expr()])
        cell = dy.concatenate([first_final_state(fwd_builder).cell_expr(),
                               first_final_state(bwd_builder).cell_expr()])
        final_states.append(FinalTransducerState(main, cell))
    self._final_states = final_states

    return ret_es
def __call__(self, sent):
    """
    Transduce ``sent`` with ``self.builder`` and return an ExpressionSequence.

    The builder's result is wrapped as ``ExpressionSequence(expr_list=...)``
    unless it already is an ExpressionSequence. The builder's final states
    are then copied to ``self._final_states``.

    Args:
      sent: the input passed straight to ``self.builder.transduce``.
    Returns:
      The builder output as an ExpressionSequence.
    """
    raw = self.builder.transduce(sent)
    wrapped = (raw if isinstance(raw, ExpressionSequence)
               else ExpressionSequence(expr_list=raw))
    # Read the final states only after the builder has run.
    self._final_states = self.builder.get_final_states()
    return wrapped
def embed_sent(self, sent_len):
    """
    Return the leading ``sent_len`` columns of the embedding parameter.

    Selects the region from ``[0, 0]`` up to ``[self.emb_dim, sent_len]`` of
    ``self.embeddings`` with unit strides and wraps it, without a mask, in an
    ExpressionSequence.

    Args:
      sent_len: upper bound of the selection along the second dimension
                (presumably the number of positions to embed -- TODO confirm).
    Returns:
      An ExpressionSequence built from the selected tensor, with ``mask=None``.
    """
    table = dy.parameter(self.embeddings)
    strides = [1, 1]
    range_from = [0, 0]
    range_to = [self.emb_dim, sent_len]
    selected = dy.strided_select(table, strides, range_from, range_to)
    return ExpressionSequence(expr_tensor=selected, mask=None)