Example #1
    def _forward(self, emissions):
        """Viterbi forward to calculate all path scores.

        The running score vector starts at 0 for ``self.start_idx`` and -1e4
        for every other tag. For each emission the transition matrix and the
        emission are combined with ``dy.colwise_add`` and reduced with
        ``dy.logsumexp``; ``dy.pick(transitions, self.end_idx)`` is added
        before the final reduction.

        :param emissions: List[dy.Expression]

        Returns:
            dy.Expression ((1,), B)
        """
        init_alphas = [-1e4] * self.n_tags
        init_alphas[self.start_idx] = 0

        alphas = dy.inputVector(init_alphas)
        transitions = self.transitions
        # len(emissions) == T
        for emission in emissions:
            add_emission = dy.colwise_add(transitions, emission)
            scores = dy.colwise_add(dy.transpose(add_emission), alphas)
            # dy.logsumexp takes a list of dy.Expression and computes logsumexp
            # elementwise across the lists so for example the logsumexp is calculated
            # for [0] in each list. This means we want the scores for a given
            # transition scores for a tag to be in the columns
            alphas = dy.logsumexp([x for x in scores])
        last_alpha = alphas + dy.pick(transitions, self.end_idx)
        alpha = dy.logsumexp([x for x in last_alpha])
        return alpha
Example #2
    def calc_attention(self, state):
        # NOTE(review): this snippet appears to be truncated. The dy.conv2d(...)
        # call, the dy.transpose(...) call that follows it and the
        # add_to_tensor_expr(...) call never close their parentheses, and the
        # two consecutive assignments to `h` look like the two arms of an
        # if/else whose `else:` line is missing. Recover the original source
        # before relying on this block.
        V = dy.parameter(self.pV)
        U = dy.parameter(self.pU)

        WI = self.WI
        curr_sent_mask = self.curr_sent.mask
        # Only taken when earlier attention vectors have been recorded.
        if self.attention_vecs:
            conv_feats = dy.conv2d(self.attention_vecs[-1],
                                   stride=[1, 1],
            conv_feats = dy.transpose(
                           (conv_feats.dim()[0][0], self.hidden_dim),
            h = dy.tanh(dy.colwise_add(WI + conv_feats, V * state))
            h = dy.tanh(dy.colwise_add(WI, V * state))
        scores = dy.transpose(U * h)
        if curr_sent_mask is not None:
            scores = curr_sent_mask.add_to_tensor_expr(scores,
        # Softmax turns the (optionally masked) scores into attention weights.
        normalized = dy.softmax(scores)
        return normalized
def softmax(x):
    """Compute a numerically stable softmax of ``x`` with DyNet operations.

    The maximum taken along dimension 1 is subtracted from ``x`` before
    exponentiating (the usual stability fix), and the exponentials are then
    divided element-wise by their sum over columns, broadcast back to the
    shape of ``x``.

    NOTE(review): the original documentation text described a TensorFlow
    exercise; the implementation below uses DyNet only.

    Args:
        x: dy.Expression. Assumed to be a matrix of shape
            (n_samples, n_features) with feature vectors stored as rows, as
            stated by the original documentation -- TODO confirm.

    Returns:
        dy.Expression produced by ``dy.cdiv``; presumably the same shape as
        ``x`` with each row summing to one.
    """
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    # Adding the column-sum vector to a zero matrix broadcasts it to x's
    # shape so it can be used as the element-wise denominator.
    sum_exp = dy.colwise_add(dy.zeroes(x.dim()[0]), dy.sum_cols(x_exp))

    out = dy.cdiv(x_exp, sum_exp)

    return out
Example #4
    def hier_attend(self, context_pre, context_pos, state):
        """Blend the "pre" and "pos" contexts with attention weights.

        For each context an energy is computed as
        ``transpose(v * tanh(colwise_add(w1 * context, w2 * state_vector)))``
        and the context is projected with its own ``hier_w_*`` matrix. The two
        energies are concatenated and soft-maxed, and the projected contexts
        (concatenated as columns) are multiplied by the resulting weights.

        :param context_pre: dy.Expression for the first context
        :param context_pos: dy.Expression for the second context
        :param state: object whose ``s()`` yields the expressions that are
            concatenated into the state vector
        :return: dy.Expression, the weighted combination of both projections
        """
        state_weight = dy.parameter(self.hier_w2)
        scorer = dy.parameter(self.hier_v)

        state_term = state_weight * dy.concatenate(list(state.s()))

        energies = []
        projections = []
        branches = (
            (self.hier_w1_pre, self.hier_w_pre, context_pre),
            (self.hier_w1_pos, self.hier_w_pos, context_pos),
        )
        for energy_param, projection_param, ctx in branches:
            # Energy of this context given the current state.
            energy_input = dy.parameter(energy_param) * ctx
            energies.append(dy.transpose(
                scorer * dy.tanh(dy.colwise_add(energy_input, state_term))))
            # Projection of this context that will be mixed by the weights.
            projections.append(dy.parameter(projection_param) * ctx)

        beta = dy.softmax(dy.concatenate(energies))
        return dy.concatenate_cols(projections) * beta
Example #5
    def _forward(self, emissions):
        """Viterbi forward to calculate all path scores.

        The running score vector starts at 0 for ``self.start_idx`` and -1e4
        for every other tag. For each emission the transition matrix and the
        emission are combined with ``dy.colwise_add`` and reduced with
        ``dy.logsumexp``; ``dy.pick(transitions, self.end_idx)`` is added
        before the final reduction.

        :param emissions: List[dy.Expression]

        Returns:
            dy.Expression ((1,), B)
        """
        init_alphas = [-1e4] * self.n_tags
        init_alphas[self.start_idx] = 0

        alphas = dy.inputVector(init_alphas)
        transitions = self.transitions
        # len(emissions) == T
        for emission in emissions:
            add_emission = dy.colwise_add(transitions, emission)
            scores = dy.colwise_add(dy.transpose(add_emission), alphas)
            # dy.logsumexp takes a list of dy.Expression and computes logsumexp
            # elementwise across the lists so for example the logsumexp is calculated
            # for [0] in each list. This means we want the scores for a given
            # transition scores for a tag to be in the columns
            alphas = dy.logsumexp([x for x in scores])
        last_alpha = alphas + dy.pick(transitions, self.end_idx)
        alpha = dy.logsumexp([x for x in last_alpha])
        return alpha
 def attend_with_prev(self, state, w1dt, prev_att):
     """Return attention weights that also condition on the previous attention.

     The score is ``attention_v * tanh(w1dt + attention_w2 * state +
     attention_w3 * prev_att)`` (the additions are ``dy.colwise_add``),
     transposed and passed through a softmax.
     """
     state_term = self.attention_w2 * state
     prev_term = self.attention_w3 * prev_att
     summed = dy.colwise_add(dy.colwise_add(w1dt, state_term), prev_term)
     scores = dy.transpose(self.attention_v * dy.tanh(summed))
     return dy.softmax(scores)
def softmax(x):
    """Softmax of ``x`` built from DyNet primitives with max-subtraction.

    The maximum along dimension 1 is subtracted before exponentiating, and
    the exponentials are divided element-wise by their column-sum, which is
    broadcast to the shape of ``x`` by adding it to a zero matrix.
    """
    shifted = dy.colwise_add(x, -dy.max_dim(x, 1))
    exps = dy.exp(shifted)
    totals = dy.sum_cols(exps)
    # Broadcast the totals vector to x's shape for the element-wise division.
    denominator = dy.colwise_add(dy.zeroes(x.dim()[0]), totals)
    return dy.cdiv(exps, denominator)
Example #8
    def get_word_att(self, ut, l, s):
        """Attend over the word encodings of ``ut`` and store the context.

        The word encodings in ``ut.words_enc`` are concatenated as columns,
        scored with ``attention_word_v * tanh(w1 * words + w2 * l + w3 * s)``
        and soft-maxed; the weighted sum of the encodings is written to
        ``ut.context``. Nothing is returned in the code visible here.

        :param ut: object exposing ``words_enc`` (a list of dy.Expression);
            its ``context`` attribute is overwritten
        :param l: dy.Expression multiplied by ``self.attention_word_w2``
        :param s: dy.Expression multiplied by ``self.attention_word_w3``
        """
        input_mat = dy.concatenate_cols(ut.words_enc)

        # dy.colwise_add takes exactly two operands, so the three-term sum
        # needs two nested calls (the outer one was missing, which left the
        # parentheses unbalanced).
        unnormalized = dy.transpose(self.attention_word_v * dy.tanh(
            dy.colwise_add(
                dy.colwise_add(self.attention_word_w1 * input_mat,
                               self.attention_word_w2 * l),
                self.attention_word_w3 * s)))
        att_weights = dy.softmax(unnormalized)

        ut.context = input_mat * att_weights
Example #9
def transform(sentence):
    """Run ``sentence`` through the two-layer ReLU "transform" network.

    Each layer computes ``rectify(colwise_add(w * x, b))`` using the
    module-level ``transform_w1/b1`` and ``transform_w2/b2`` parameters.
    """
    loaded = [
        dy.parameter(param)
        for param in (transform_w1, transform_b1, transform_w2, transform_b2)
    ]

    hidden = sentence
    # loaded is [w1, b1, w2, b2]; walk it as (weight, bias) pairs.
    for weight, bias in zip(loaded[0::2], loaded[1::2]):
        hidden = dy.rectify(dy.colwise_add(weight * hidden, bias))
    return hidden
Example #10
    def __attention_mlp(self, H_f, h_e, W1_att_e, W1_att_f, w2_att,
                        W1_att_lang, langeb):
        """Return the MLP-attention context over ``H_f``.

        Scores are ``w2_att * tanh(W1_att_f * H_f + W1_att_e * h_e +
        W1_att_lang * langeb)`` (sums via ``dy.colwise_add``); element 0 of
        the result is soft-maxed and used to weight ``H_f``.
        """
        source_term = W1_att_f * H_f
        target_term = W1_att_e * h_e
        language_term = W1_att_lang * langeb

        hidden = dy.tanh(
            dy.colwise_add(dy.colwise_add(source_term, target_term),
                           language_term))
        # Index 0 selects the single score row produced by w2_att.
        scores = (w2_att * hidden)[0]
        return H_f * dy.softmax(scores)
Example #11
 def __attention_mlp_batch(self, H_f_batch, h_e_batch, W1_att_e, W1_att_f,
                           w2_att, W1_att_lang, langeb):
     """Batched MLP attention; returns the context for each batch element.

     Scores are ``w2_att * tanh(W1_att_f * H_f_batch + W1_att_e * h_e_batch
     + W1_att_lang * langeb)``; element 0 is soft-maxed and used to weight
     ``H_f_batch``. The shape annotations below are carried over from the
     original comments and are not checked by the code.
     """
     # H_f_batch: (2 * hidden_size, num_step, batch_size)
     # h_e_batch: (hidden_size, batch_size)
     # dy.colwise_add takes two operands, so the three-term sum is nested
     # (the outer call was missing, leaving the parentheses unbalanced).
     a_t_batch = dy.tanh(
         dy.colwise_add(
             dy.colwise_add(W1_att_f * H_f_batch,
                            W1_att_e * h_e_batch),
             W1_att_lang * langeb))  # (attention_size, num_step, batch_size)
     a_t_batch = w2_att * a_t_batch  # (1, num_step, batch_size)
     a_t_batch = a_t_batch[0]  # (num_step, batch_size)
     alignment_batch = dy.softmax(a_t_batch)  # (num_step, batch_size)
     c_t_batch = H_f_batch * alignment_batch  # (2 * hidden_size, batch_size)
     return c_t_batch
def get_v1_v2(alpha, beta, sen1, sen2, model_params):
    """Compare each sentence with the attended version of the other one.

    ``sen1`` is concatenated with ``beta`` and ``sen2`` with ``alpha`` along
    dimension 0; both go through the same two-layer ReLU network defined by
    ``G_w1``, ``G_b1``, ``G_w2`` and ``G_b2`` in ``model_params``.

    Returns the pair ``(v1, v2)``.
    """
    G_w1 = model_params['G_w1']
    G_b1 = model_params['G_b1']
    G_w2 = model_params['G_w2']
    G_b2 = model_params['G_b2']

    def _compare(sentence, attended):
        joined = dy.concatenate([sentence, attended], d=0)
        # Dropout on `joined` is disabled in the original (commented out).
        hidden = dy.rectify(dy.colwise_add(G_w1 * joined, G_b1))
        return dy.rectify(G_w2 * hidden + G_b2)

    v1 = _compare(sen1, beta)
    v2 = _compare(sen2, alpha)
    return v1, v2
def set_E_matrix(sen1, sen2, model_params):
    """Project both sentences and return their pairwise score matrix.

    Each sentence goes through the two-layer ReLU network defined by
    ``F_w1``, ``F_b1``, ``F_w2`` and ``F_b2`` in ``model_params``; the score
    matrix is ``transpose(F(sen1)) * F(sen2)``.

    Returns ``(E_matrix, F_sen1, F_sen2)``.
    """
    F_w1 = model_params['F_w1']
    F_b1 = model_params['F_b1']
    F_w2 = model_params['F_w2']
    F_b2 = model_params['F_b2']

    def _project(sentence):
        # Input dropout is disabled in the original (commented out).
        hidden = dy.rectify(dy.colwise_add(F_w1 * sentence, F_b1))
        return dy.rectify(F_w2 * hidden + F_b2)

    F_sen1 = _project(sen1)
    F_sen2 = _project(sen2)

    return dy.transpose(F_sen1) * F_sen2, F_sen1, F_sen2
Example #14
    def calc_attention(self, src_trans_att, h_t, training=True):
        """Return soft-maxed attention weights for decoder state ``h_t``.

        ``self.W_h`` and ``self.U`` are loaded through the ``parameters``
        context manager (with ``trainable=training``); the weights are
        ``softmax(transpose(U * tanh(colwise_add(src_trans_att, W_h * h_t))))``.
        """
        with parameters(self.W_h, self.U, trainable=training) as (state_proj, scorer):
            hidden = dy.tanh(dy.colwise_add(src_trans_att, state_proj * h_t))
            return dy.softmax(dy.transpose(scorer * hidden))
Example #15
    def attend(self, encodings, h):
        r"""Compute attention score

        Given :math:`z_i` the encoder's output at time :math:`i`, :math:`h_{j-1}` the decoder's output at time :math:`j-1`, the attention score is computed as :

        .. math::

            s_{ij} = V_a^T\tanh(W_a z_i + W_{ha} h_{j-1})

        The scores are normalised with a softmax and used to weight the
        encodings. No bias term is added by this implementation.

        Args:
            encodings (dynet.Expression): Source sentence encodings obtained with self.encode
            h (dynet.Expression): Decoder output at the previous timestep

        Returns:
            tuple: Two dynet Expressions, the context and the attention weights
        """
        Va, Wa, Wha = self.Va_p.expr(), self.Wa_p.expr(), self.Wha_p.expr()
        d = dy.tanh(dy.colwise_add(Wa * encodings, Wha * h))
        scores = dy.transpose(d) * Va
        weights = dy.softmax(scores)
        context = encodings * weights
        return context, weights
Example #16
    def decode(self, emissions):
        """Viterbi decode to find the best sequence.

        :param emissions: List[dy.Expression]

        Returns:
            List[int], dy.Expression ((1,), B)
        """
        if self.add_ends:
            emissions = CRF._prep_input(emissions)
        backpointers = []
        transitions = self.transitions

        inits = [-1e4] * self.n_tags
        inits[self.start_idx] = 0
        alphas = dy.inputVector(inits)

        for emission in emissions:
            next_vars = dy.colwise_add(dy.transpose(transitions), alphas)
            best_tags = np.argmax(next_vars.npvalue(), 0)
            v_t = dy.max_dim(next_vars, 0)
            alphas = v_t + emission
            # Remember, for every tag, which previous tag scored best so the
            # path can be traced back after the last step.
            backpointers.append(best_tags)

        terminal_expr = alphas + dy.pick(transitions, self.end_idx)
        best_tag = np.argmax(terminal_expr.npvalue())
        path_score = dy.pick(terminal_expr, best_tag)

        # Follow the back-pointers from the best final tag to the start.
        best_path = [best_tag]
        for bp_t in reversed(backpointers):
            best_tag = bp_t[best_tag]
            best_path.append(best_tag)
        # The last tag appended precedes the first emission (presumably the
        # start tag); it is dropped before restoring forward order.
        best_path.pop()
        best_path.reverse()
        return best_path, path_score
    def attend_tags(self, state, w1dt):
        """Return soft-maxed tag-attention weights for ``state``.

        Computes ``softmax(transpose(tag_attention_v *
        tanh(colwise_add(w1dt, tag_attention_w2 * state))))``.
        """
        state_term = self.tag_attention_w2 * state
        hidden = dy.tanh(dy.colwise_add(w1dt, state_term))
        return dy.softmax(dy.transpose(self.tag_attention_v * hidden))
Example #18
 def attend(self, w1dt, vectors, state):
     """Return the attention-weighted combination of ``vectors``.

     The decoder state vector (concatenation of ``state.s()``) is projected
     with ``self.attention_w2``, combined with ``w1dt`` via
     ``dy.colwise_add``, scored with ``self.attention_v`` and soft-maxed.
     When the module-level ``debug`` flag is truthy, the shapes of the
     intermediate values are printed; evaluating them forces a forward pass.

     :param w1dt: dy.Expression, the precomputed encoder-side projection
     :param vectors: dy.Expression multiplied by the attention weights
     :param state: object whose ``s()`` yields the state expressions
     :return: dy.Expression, the context
     """
     def _debug_shape(label, expr):
         # Single-argument print() gives the same text as the original
         # Python 2 ``print label, shape`` statement (label, one space,
         # shape) and is also valid on Python 3.
         print("%s %s" % (label, np.asarray(expr.value()).shape))

     if debug:
         print("In attention")
     w2 = dy.parameter(self.attention_w2)
     v = dy.parameter(self.attention_v)
     state_vec = dy.concatenate(list(state.s()))
     if debug:
         _debug_shape("Shape of w2: ", w2)
         _debug_shape("Shape of state : ", state_vec)
     w2dt = w2 * state_vec
     if debug:
         _debug_shape(" Shape of W2dt: ", w2dt)
     unnormalized = dy.transpose(v * dy.tanh(dy.colwise_add(w1dt, w2dt)))
     if debug:
         _debug_shape("Shape of unnormalized: ", unnormalized)
     att_weights = dy.softmax(unnormalized)
     if debug:
         _debug_shape("Shape of Attention weights: ", att_weights)
     context = vectors * att_weights
     if debug:
         _debug_shape("Shape of context: ", context)
     return context
Example #19
 def generate(self, h_a, trg, maxlen=100):
     """Greedily decode up to ``maxlen`` steps with attention over ``h_a``.

     ``h_a`` is padded with zero vectors up to ``self.max_len`` entries and
     concatenated along dimension 1. At every step attention weights are
     computed from the previous decoder output, the attended vector and the
     previous output are fed to ``self.decoder_rnn``, and the output is
     projected with ``self.pred``. Decoding stops early when the arg-max of
     the projection is index 1 (presumably the end-of-sequence symbol --
     TODO confirm).

     :param h_a: sequence of dy.Expression; it is not modified
     :param trg: unused by the code visible here
     :param maxlen: maximum number of decoding steps
     :return: list of dy.Expression, one projection per executed step
     """
     # Pad a copy so the caller's list is not extended in place.
     padding = [dy.zeros(self.hdim)] * (self.max_len - len(h_a))
     h_ak = dy.concatenate(list(h_a) + padding, 1)
     context = h_ak * dy.parameter(self.pre_attend)
     prev_out = dy.zeros((self.hdim))

     # The parameters do not change between steps, so load them once.
     V = dy.parameter(self.v)
     U = dy.parameter(self.u)
     projection = dy.parameter(self.pred)

     outputs = []
     s = self.decoder_rnn.initial_state()
     for _ in range(maxlen):
         hidden = dy.tanh(dy.colwise_add(context, V * prev_out))
         attention_weights = dy.softmax(dy.transpose(U * hidden))
         emb = dy.concatenate([h_ak * attention_weights, prev_out])
         s = s.add_input(emb)
         prev_out = s.output()
         # Build the projection once and reuse it for both the returned
         # outputs and the stop test.
         logits = projection * prev_out
         outputs.append(logits)
         if np.argmax(logits.value()) == 1:
             break
     return outputs
def attend(input_mat, state, w1dt_array):
    """Compute one attention context per (input, state, w1dt) triple.

    For every decoder state the concatenation of ``s.s()`` is projected with
    the module-level ``attention_w2``; each projection is paired with the
    matching entry of ``w1dt_array`` to produce soft-maxed attention weights
    (scored with ``attention_v``), which then weight the matching entry of
    ``input_mat``. When ``debug_dimensions`` is truthy the sizes reported by
    ``get_tensor_size`` are printed along the way.

    The original comment gave the inputs as [l * 2hE * n], [l * 2hD * n] and
    [l * a * n] and the output as [l * a * n]; the code does not check this.

    :param input_mat: iterable of dy.Expression
    :param state: iterable of objects whose ``s()`` yields state expressions
    :param w1dt_array: iterable of dy.Expression
    :return: list of dy.Expression contexts, one per zipped triple
    """
    def _report(label, value):
        # Single-argument print() gives the same text as the original
        # Python 2 ``print label, value`` statement and works on Python 3.
        print("%s %s" % (label, value))

    w2 = dy.parameter(attention_w2)
    v = dy.parameter(attention_v)
    if debug_dimensions:
        print("In attention ")
        _report("   Dimensions of input mat are : ", get_tensor_size(input_mat))
        _report("   Dimensions of w1dt array: ", get_tensor_size(w1dt_array))
        _report("   Dimensions of state ", len(state))

    # w2dt = weight matrix * decoder state output, collected per state.
    # (The results were never stored before, so the function always
    # returned an empty list.)
    w2dt_array = [w2 * dy.concatenate(list(s.s())) for s in state]
    if debug_dimensions:
        _report("   Dimensions of w2dt array: ", get_tensor_size(w2dt_array))

    att_weights_array = [
        dy.softmax(dy.transpose(v * dy.tanh(dy.colwise_add(a, b))))
        for a, b in zip(w1dt_array, w2dt_array)
    ]
    if debug_dimensions:
        _report("   Dimensions of attention weights array: ",
                get_tensor_size(att_weights_array))

    context_array = [im * at for im, at in zip(input_mat, att_weights_array)]
    if debug_dimensions:
        _report("   Dimensions of contexts array: ",
                get_tensor_size(context_array))
    return context_array
Example #21
 def cal_scores(self, s):
     # NOTE(review): this snippet appears to be truncated. The dy.tanh(...)
     # call closes one parenthesis more than it opens, and `W` and `b` are
     # never bound inside the function, so lines seem to be missing both
     # before and inside that call. Recover the original before relying on it.
     # Nothing to score when self.ps is empty.
     if len(self.ps) == 0:
         return None
     hs_matrix = dy.tanh(
                        W * s))
     # Softmax over the transposed scores.
     return dy.softmax(dy.transpose(b * hs_matrix))
Example #22
def _vaswani_model_scores(m):
    """Score the beam entries in ``m`` with the module-level output layer.

    The hidden layer is ``rectify(colwise_add(c2_Wlm * m["beam_lm_hs"],
    pick(m["aux_c2"], m["idx"], 1)))``; the scores are the transpose of
    ``o_W * hidden + o_b``. When ``cfg["accumulate_scores"]`` is set,
    ``m["acc_scores"]`` is added and the sum is stored in ``m["scores"]``.
    """
    lm_term = c2_Wlm * m["beam_lm_hs"]
    aux_column = dy.pick(m["aux_c2"], m["idx"], 1)
    hidden = dy.rectify(dy.colwise_add(lm_term, aux_column))

    # if cfg["use_beam_bilstm"]:
    #     _, beam_size_prev = out_c2.dim()[0]
    #     beam_hs = [dy.pick(out_c2, i, 1) for i in xrange(beam_size_prev)]
    #     bf_init = b_fwd.initial_state()
    #     bb_init = b_bwd.initial_state()
    #     bf_hs = dy.concatenate_cols(bf_init.transduce(beam_hs))
    #     bb_hs = dy.concatenate_cols(bb_init.transduce(reversed(beam_hs))[::-1])
    #     out_c2 = dy.concatenate([bf_hs, bb_hs])

    # if cfg["use_beam_mlp"]:
    #     out_b = dy.max_dim(b_W1 * out_c2 + b_b1, 1)
    #     out_c2 = dy.colwise_add(out_c2, dy.rectify(b_W2 * out_b + b_b2))

    result = dy.transpose(o_W * hidden + o_b)
    if not cfg["accumulate_scores"]:
        return result

    # m["scores"] is only written in accumulation mode.
    result = m["acc_scores"] + result
    m["scores"] = result
    return result
Example #23
def viterbi(emissions, transition, start_idx, end_idx, norm=False):
    """Viterbi decode ``emissions`` and return the best path with its score.

    :param emissions: non-empty list of dy.Expression; the number of tags is
        read from the first dimension of ``emissions[0]``
    :param transition: dy.Expression holding the transition scores
    :param start_idx: index of the tag whose initial score is 0 (all others
        start at -1e4)
    :param end_idx: index passed to ``dy.pick(transition, end_idx)`` for the
        final step
    :param norm: when true, the initial score vector is passed through
        ``dy.log_softmax``
    :return: ``(best_path, path_score)`` -- the list of best tag indices,
        one per emission, and the dy.Expression score of that path
    """
    n_tags = emissions[0].dim()[0][0]
    backpointers = []

    inits = [-1e4] * n_tags
    inits[start_idx] = 0
    alphas = dy.inputVector(inits)
    alphas = dy.log_softmax(alphas) if norm else alphas

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transition), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        # Remember, for every tag, which previous tag scored best so the
        # path can be traced back after the last step.
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transition, end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    # Follow the back-pointers from the best final tag to the start.
    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    # The last tag appended precedes the first emission (presumably the
    # start tag); it is dropped before restoring forward order.
    best_path.pop()
    best_path.reverse()
    return best_path, path_score
Example #24
 def attend(self, enc_h_ts_mat, dec_h_t, encatt, store_weights=False):
     # NOTE(review): this snippet appears to be damaged. The parameter
     # description directly below has lost its docstring delimiters, and the
     # body of `if store_weights:` near the end is missing. What was stored
     # there cannot be determined from this text; recover the original
     # before relying on this block.
     enc_h_ts_mat: dynet.Expression, (seq_len x enc_hid_dim)
         matrix of encoding hidden state column vectors
     dec_h_t: dynet.RNNState, (dec_hid_dim)
         current decoder hidden state
     encatt: dynet.Expression, (seq_len x att_dim)
         projection of the encoder hidden states into the attention space
     store_weights: bool,
         whether to store attention weights
     dec2att = dy.parameter(self.dec2att)
     att_v = dy.parameter(self.att_v)
     # project output of last hidden layer (state.h()[-1] == state.output())
     # to the dimensionality of the attention space
     decatt = dec2att * dec_h_t.output()
     # projection vector att_v
     # unnormalized var-len alignment vector (with len == source seq len)
     # (seq_len)
     unnormalized_weights = att_v * dy.tanh(dy.colwise_add(encatt, decatt))
     weights = dy.softmax(dy.transpose(unnormalized_weights))
     if store_weights:
     # The context is the encoder state matrix weighted by the attention.
     context = enc_h_ts_mat * weights
     return context
Example #25
def combine(sentence, sentence_other_attended):
    """Combine a sentence with the attended representation of the other one.

    Both inputs are concatenated along dimension 0 and passed through a
    two-layer ReLU network built from the module-level ``combine_w1/b1`` and
    ``combine_w2/b2`` parameters.

    :param sentence: dy.Expression
    :param sentence_other_attended: dy.Expression
    :return: dy.Expression, the transformed combination
    """
    w1 = dy.parameter(combine_w1)
    b1 = dy.parameter(combine_b1)
    w2 = dy.parameter(combine_w2)
    b2 = dy.parameter(combine_b2)

    sentence_combine = dy.concatenate([sentence, sentence_other_attended], d=0)
    # NOTE(review): the argument of this debug call was cut off in the
    # source. Logging Expression.dim() is an assumption about what "shape"
    # referred to -- confirm against the original. Lazy %-formatting keeps
    # the call cheap when debug logging is disabled.
    logging.debug("Sentence combined with Attended shape: %s",
                  sentence_combine.dim())

    combine_transformed = dy.colwise_add(w1 * sentence_combine, b1)
    combine_transformed = dy.rectify(combine_transformed)
    combine_transformed = dy.colwise_add(w2 * combine_transformed, b2)
    combine_transformed = dy.rectify(combine_transformed)

    return combine_transformed
Example #26
def viterbi(emissions, transition, start_idx, end_idx, norm=False):
    """Viterbi decode ``emissions`` and return the best path with its score.

    :param emissions: non-empty list of dy.Expression; the number of tags is
        read from the first dimension of ``emissions[0]``
    :param transition: dy.Expression holding the transition scores
    :param start_idx: index of the tag whose initial score is 0 (all others
        start at -1e4)
    :param end_idx: index passed to ``dy.pick(transition, end_idx)`` for the
        final step
    :param norm: when true, the initial score vector is passed through
        ``dy.log_softmax``
    :return: ``(best_path, path_score)`` -- the list of best tag indices,
        one per emission, and the dy.Expression score of that path
    """
    n_tags = emissions[0].dim()[0][0]
    backpointers = []

    inits = [-1e4] * n_tags
    inits[start_idx] = 0
    alphas = dy.inputVector(inits)
    alphas = dy.log_softmax(alphas) if norm else alphas

    for emission in emissions:
        next_vars = dy.colwise_add(dy.transpose(transition), alphas)
        best_tags = np.argmax(next_vars.npvalue(), 0)
        v_t = dy.max_dim(next_vars, 0)
        alphas = v_t + emission
        # Remember, for every tag, which previous tag scored best so the
        # path can be traced back after the last step.
        backpointers.append(best_tags)

    terminal_expr = alphas + dy.pick(transition, end_idx)
    best_tag = np.argmax(terminal_expr.npvalue())
    path_score = dy.pick(terminal_expr, best_tag)

    # Follow the back-pointers from the best final tag to the start.
    best_path = [best_tag]
    for bp_t in reversed(backpointers):
        best_tag = bp_t[best_tag]
        best_path.append(best_tag)
    # The last tag appended precedes the first emission (presumably the
    # start tag); it is dropped before restoring forward order.
    best_path.pop()
    best_path.reverse()
    return best_path, path_score
Example #27
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    """Return the attention output over the source and the alignment.

    The alignment is ``softmax(transpose(tanh(colwise_add(
    fixed_attentional_component, w1_att_tgt * tgt_output_embedding))) *
    w2_att)``, using the module-level ``w1_att_tgt_p`` and ``w2_att_p``
    parameters. The source-side projection is not computed here;
    ``fixed_attentional_component`` is presumably that precomputed term --
    confirm against the caller.

    :param src_output_matrix: dy.Expression weighted by the alignment
    :param tgt_output_embedding: dy.Expression for the current target state
    :param fixed_attentional_component: dy.Expression added column-wise to
        the target projection
    :return: ``(att_output, alignment)``
    """
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component, w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
def calc_attention(src_output_matrix, tgt_output_embedding, fixed_attentional_component):
    """Return the attention output over the source and the alignment.

    The alignment is ``softmax(transpose(tanh(colwise_add(
    fixed_attentional_component, w1_att_tgt * tgt_output_embedding))) *
    w2_att)``, using the module-level ``w1_att_tgt_p`` and ``w2_att_p``
    parameters. The source-side projection is not computed here;
    ``fixed_attentional_component`` is presumably that precomputed term --
    confirm against the caller.

    :param src_output_matrix: dy.Expression weighted by the alignment
    :param tgt_output_embedding: dy.Expression for the current target state
    :param fixed_attentional_component: dy.Expression added column-wise to
        the target projection
    :return: ``(att_output, alignment)``
    """
    w1_att_tgt = dy.parameter(w1_att_tgt_p)
    w2_att = dy.parameter(w2_att_p)
    a_t = dy.transpose(dy.tanh(dy.colwise_add(fixed_attentional_component, w1_att_tgt * tgt_output_embedding))) * w2_att
    alignment = dy.softmax(a_t)
    att_output = src_output_matrix * alignment
    return att_output, alignment
Example #29
    def calc_attention(self, state):
        """Return soft-maxed attention weights for ``state``.

        Computes ``softmax(transpose(U * tanh(colwise_add(self.WI,
        V * state))))`` with ``V`` and ``U`` loaded from ``self.pV`` and
        ``self.pU``.
        """
        state_proj = dy.parameter(self.pV)
        scorer = dy.parameter(self.pU)

        hidden = dy.tanh(dy.colwise_add(self.WI, state_proj * state))
        return dy.softmax(dy.transpose(scorer * hidden))
 def attend(self, input_mat, state, w1dt):
     """Return the attention context of ``input_mat`` for ``state``.

     The concatenation of ``state.s()`` is projected with
     ``self.attention_w2``, added column-wise to ``w1dt``, scored with
     ``self.attention_v`` and soft-maxed; the weights multiply ``input_mat``.
     """
     state_weight = dy.parameter(self.attention_w2)
     scorer = dy.parameter(self.attention_v)
     state_term = state_weight * dy.concatenate(list(state.s()))
     hidden = dy.tanh(dy.colwise_add(w1dt, state_term))
     weights = dy.softmax(dy.transpose(scorer * hidden))
     return input_mat * weights
Example #31
    def get_utt_att(self, uts, s):
        """Return the attention-weighted sum of the utterance encodings.

        The ``utt_enc`` of every element of ``uts`` is stacked as a column,
        scored against ``s`` and soft-maxed; the weights multiply the
        stacked matrix.

        NOTE(review): the scoring vector is ``self.attention_word_v`` while
        the projections are ``attention_utt_w1`` / ``attention_utt_w2`` --
        confirm that sharing the word-level vector is intended.
        """
        encodings = dy.concatenate_cols([utt.utt_enc for utt in uts])

        summed = dy.colwise_add(self.attention_utt_w1 * encodings,
                                self.attention_utt_w2 * s)
        scores = dy.transpose(self.attention_word_v * dy.tanh(summed))
        weights = dy.softmax(scores)

        return encodings * weights
Example #32
 def _attend(self, query, mask=None):
     """Return soft-maxed attention scores for ``query``.

     ``query`` is projected with ``self.decoder``, added column-wise to
     ``self.context_proj``, passed through tanh and scored with ``self.v``.
     When ``mask`` is given, the scores are multiplied element-wise by
     ``mask[0]`` and ``mask[1] * -1e9`` is added before the softmax.

     Shapes, per the original annotations (not checked by the code):
     query ((H), B), mask ((T, 1), B), result ((T, 1), B).
     """
     projected = self.decoder * query
     hidden = dy.tanh(dy.colwise_add(self.context_proj, projected))
     scores = dy.transpose(self.v * hidden)
     if mask is None:
         return dy.softmax(scores)

     kept = dy.cmult(scores, mask[0])
     # A large negative offset on mask[1] positions drives their softmax
     # weight towards zero.
     penalty = mask[1] * dy.scalarInput(-1e9)
     return dy.softmax(kept + penalty)
Example #33
  def calc_attention(self, state):
    """Return soft-maxed attention weights for ``state``.

    Computes ``softmax(transpose(U * tanh(colwise_add(self.WI, V * state))))``
    with ``V`` and ``U`` loaded from ``self.pV`` and ``self.pU``.
    """
    state_proj = dy.parameter(self.pV)
    scorer = dy.parameter(self.pU)

    hidden = dy.tanh(dy.colwise_add(self.WI, state_proj * state))
    return dy.softmax(dy.transpose(scorer * hidden))
 def __attention_mlp(self, h_fs_matrix, h_e, fixed_attentional_component):
     """Return the MLP-attention context over ``h_fs_matrix``.

     The alignment is ``softmax(transpose(tanh(colwise_add(
     fixed_attentional_component, W1_att_e * h_e))) * w2_att)`` and the
     context is ``h_fs_matrix * alignment``.

     :param h_fs_matrix: dy.Expression weighted by the alignment
     :param h_e: dy.Expression projected with ``self.W1_att_e``
     :param fixed_attentional_component: dy.Expression added column-wise to
         the projection (presumably the precomputed source-side term --
         confirm against the caller)
     :return: dy.Expression, the context
     """
     W1_att_e = dy.parameter(self.W1_att_e)
     w2_att = dy.parameter(self.w2_att)
     # The tanh/colwise_add wrapper around the two terms was missing,
     # leaving unbalanced parentheses and an unused parameter.
     a_t = dy.transpose(
         dy.tanh(dy.colwise_add(fixed_attentional_component,
                                W1_att_e * h_e))) * w2_att
     alignment = dy.softmax(a_t)
     c_t = h_fs_matrix * alignment
     return c_t
Example #35
def attend(input_mat, state, w1dt):
    """Return the attention context of ``input_mat`` for ``state``.

    Uses the module-level ``attention_w2`` and ``attention_v`` parameters.
    Per the original annotations (not checked by the code), ``input_mat`` is
    (encoder_state x seqlen) with the input vectors as columns and ``w1dt``
    is (attdim x seqlen).
    """
    state_weight = dy.parameter(attention_w2)
    scorer = dy.parameter(attention_v)

    # Project the concatenated decoder state into the attention space.
    state_term = state_weight * dy.concatenate(list(state.s()))
    # One score per column of w1dt, normalised with a softmax.
    hidden = dy.tanh(dy.colwise_add(w1dt, state_term))
    weights = dy.softmax(dy.transpose(scorer * hidden))
    # Weighted combination of the input columns.
    return input_mat * weights
Example #36
    def cal_scores(self, src_encodings):
        """Compute arc scores and per-label scores for the source encodings.

        The encodings are stacked as columns and passed through four ReLU
        affine layers (arc head, arc dependent, label head, label
        dependent). The arc score is ``head^T * colwise_add(U_arc_1 * dep,
        u_arc_2)``. For every label the score is the sum of a bilinear term,
        a head-only term, a dependent-only term and a bias.

        :param src_encodings: list of dy.Expression, one per source position
        :return: ``(s_arc, s_label)`` where ``s_label`` is a list with one
            dy.Expression per label
        """
        src_len = len(src_encodings)

        # Original annotation: src_ctx_dim, src_len, batch_size
        stacked = dy.concatenate_cols(src_encodings)

        arc_head_w = dy.parameter(self.W_arc_hidden_to_head)
        arc_head_b = dy.parameter(self.b_arc_hidden_to_head)
        arc_dep_w = dy.parameter(self.W_arc_hidden_to_dep)
        arc_dep_b = dy.parameter(self.b_arc_hidden_to_dep)

        label_head_w = dy.parameter(self.W_label_hidden_to_head)
        label_head_b = dy.parameter(self.b_label_hidden_to_head)
        label_dep_w = dy.parameter(self.W_label_hidden_to_dep)
        label_dep_b = dy.parameter(self.b_label_hidden_to_dep)

        arc_bilinear = dy.parameter(self.U_arc_1)
        arc_bias = dy.parameter(self.u_arc_2)

        label_bilinear = [dy.parameter(p) for p in self.U_label_1]
        label_head_vec = [dy.parameter(p) for p in self.u_label_2_1]
        label_dep_vec = [dy.parameter(p) for p in self.u_label_2_2]
        label_bias = [dy.parameter(p) for p in self.b_label]

        def _relu_layer(bias, weight):
            return dy.rectify(dy.affine_transform([bias, weight, stacked]))

        # Original annotation: n_arc_ml_units, src_len, bs
        arc_head = _relu_layer(arc_head_b, arc_head_w)
        arc_dep = _relu_layer(arc_dep_b, arc_dep_w)
        label_head = _relu_layer(label_head_b, label_head_w)
        label_dep = _relu_layer(label_dep_b, label_dep_w)

        arc_head_t = dy.transpose(arc_head)
        label_head_t = dy.transpose(label_head)

        s_arc = arc_head_t * dy.colwise_add(arc_bilinear * arc_dep, arc_bias)

        s_label = []
        for bilinear, head_vec, dep_vec, bias in zip(
                label_bilinear, label_head_vec, label_dep_vec, label_bias):
            pair_term = label_head_t * bilinear * label_dep
            # The ones vectors spread the head-only and dependent-only
            # terms across the other axis of the score matrix.
            head_term = label_head_t * head_vec * dy.ones((1, src_len))
            dep_term = dy.ones((src_len, 1)) * dep_vec * label_dep
            s_label.append(pair_term + head_term + dep_term + bias)
        return s_arc, s_label