Ejemplo n.º 1
0
    def forward(self, x):
        """Run the routing loop over ``x`` and return the squashed output.

        Shape notes below are the original author's annotations and are not
        verified by this method: ``x`` is (batchsize, 8, 1152, 1, 1) and the
        routing weights are (1, 1, 1152, 1, 10).

        Returns the final squashed tensor reshaped to ``(-1, self.lvo, self.no)``.
        """
        # Axis 1 of the zero tensor has length 1, so this softmax yields 1.0
        # in every position.
        zero_logits = nd.zeros(shape=(1, 1, self.ni, 1, self.no), ctx=x.context)
        routing_weight = nd.softmax(zero_logits, axis=1)

        # Author's note: u is (batchsize, 1, 1152, 16, 10).
        u = nd.sum(x * self.W.data(), axis=1, keepdims=True)

        # Author's note: v is (batchsize, 1, 1, 16, 10).
        v = Squash(nd.sum(u * routing_weight, axis=2, keepdims=True), axis=3)

        for _ in range(self.nr):
            # Accumulate the u/v product (summed over axis 3) into the weights.
            agreement = nd.sum(u * v, axis=3, keepdims=True)
            routing_weight = routing_weight + agreement
            coupling = nd.softmax(routing_weight, axis=2)
            v = Squash(nd.sum(u * coupling, axis=2, keepdims=True), axis=3)

        return nd.reshape(v, shape=(-1, self.lvo, self.no))
Ejemplo n.º 2
0
    def forward(self, x):
        """Extract features from ``x`` and return them with the STN transform.

        The input is multiplied by the transform produced by ``self.stn`` and
        passed through three conv/batch-norm stages. The result is then
        aggregated over axis 2, either by ``self.routing`` rounds of
        softmax-weighted summation or, when ``self.routing`` is None, by
        ``self.mp1``.

        Returns ``(features, trans)``. When ``self.global_feat`` is falsy the
        aggregated feature is repeated ``self.num_points`` times along axis 2
        and concatenated with the first-stage features along axis 1.
        """
        use_routing = self.routing is not None

        # Apply the learned input transform.
        trans = self.stn(x)
        x = nd.transpose(nd.batch_dot(nd.transpose(x, (0, 2, 1)), trans),
                         (0, 2, 1))

        x = nd.relu(self.bn1(self.conv1(x)))
        pointfeat = x
        x = nd.relu(self.bn2(self.conv2(x)))
        x = self.bn3(self.conv3(x))

        if use_routing:
            # Softmax of zeros over axis 2: uniform initial weights.
            routing_weight = nd.softmax(
                nd.zeros(shape=(1, 1, self.num_points), ctx=x.context), axis=2)
            pooled = nd.sum(x * routing_weight, axis=2, keepdims=True)
            for _ in range(self.routing):
                routing_weight = routing_weight + nd.sum(
                    x * pooled, axis=1, keepdims=True)
                coupling = nd.softmax(routing_weight, axis=2)
                pooled = nd.sum(x * coupling, axis=2, keepdims=True)
            x = pooled
        else:
            x = self.mp1(x)

        if self.global_feat:
            return x, trans
        x = x.repeat(self.num_points, axis=2)
        return nd.concat(x, pointfeat, dim=1), trans
Ejemplo n.º 3
0
 def forward(self, c, q):
     """Compute two attention products ``(A, B)`` from ``c`` and ``q``.

     ``c``, ``q`` and ``c * q`` are concatenated and fed to ``self.dense`` to
     get a score tensor ``S``, which is softmax-normalised over axis 1
     (``S_bar``) and over axis 2 (``S_2bar``).

     NOTE(review): the original referenced the undefined names ``Q`` and
     ``S_2_bar``; they are assumed to be typos for the parameter ``q`` and
     the local ``S_2bar``. Confirm that the element-wise products with the
     transposes below are the intended operations.
     """
     x = nd.concat(c, q, c * q)
     S = self.dense(x)
     S_bar = nd.softmax(S, axis=1)
     S_2bar = nd.softmax(S, axis=2)
     A = S_bar * q.T
     B = S_bar * S_2bar.T * c.T
     return A, B
Ejemplo n.º 4
0
    def forward(self, input_data):
        """Score a flat feature row made of a sentence part and two entity parts.

        Column layout of ``input_data``, as sliced below:
          * columns 0-1: two frequency values, one per entity;
          * next ``FIXED_WORD_LENGTH * DIMENSION`` columns: the word sequence;
          * then, for each entity in turn, ``MASK_LENGTH`` mask columns
            followed by ``ENTITY_EDGE_VEC_LENGTH`` edge columns.

        The word sequence goes through ``self.gru`` and ``self.gru_out``. Each
        entity's edges are attention-pooled with ``self.center_att``. Both
        results are concatenated and passed to ``self.output``.
        """
        batch = input_data.shape[0]
        freq = input_data[:, 0:2].expand_dims(1)
        features = input_data[:, 2:]

        sentence_len = FIXED_WORD_LENGTH * DIMENSION
        entity_len = MASK_LENGTH + ENTITY_EDGE_VEC_LENGTH

        def split_entity(start):
            # Return (mask, edge, neighbour) for the entity starting at `start`.
            mask = features[:, start:start + MASK_LENGTH]
            edge = features[:, start + MASK_LENGTH:start + entity_len].reshape(
                (batch, ENTITY_DEGREE, WORD_DIMENSION * 2))
            # The neighbour vector is the first half of the raw edge vector.
            return mask, edge, edge[:, :, :WORD_DIMENSION]

        x = features[:, :sentence_len].reshape(
            (batch, FIXED_WORD_LENGTH, DIMENSION))
        e1neimask, e1edge, e1neigh = split_entity(sentence_len)
        e2neimask, e2edge, e2neigh = split_entity(sentence_len + entity_len)

        # Sentence branch: swap the first two axes around the GRU call.
        hidden = self.gru(nd.transpose(x, axes=(1, 0, 2)))
        hidden = nd.transpose(hidden, axes=(1, 0, 2))
        y1 = self.gru_out(hidden.expand_dims(1))

        def attend(edge, neigh, mask, freq_col):
            # Attention over axis 1, scaled by frequency and mask before softmax.
            gate = self.center_att(nd.tanh(edge)) * freq_col
            gate = nd.softmax(gate * mask.expand_dims(2), axis=1)
            pooled = nd.batch_dot(nd.transpose(gate, axes=(0, 2, 1)), neigh)
            # 100 is hard-coded in the original; presumably WORD_DIMENSION.
            return pooled.reshape((pooled.shape[0], 100))

        e1n = attend(e1edge, e1neigh, e1neimask, freq[:, :, :1])
        e2n = attend(e2edge, e2neigh, e2neimask, freq[:, :, 1:])

        center_y = self.center_out(nd.concat(e1n, e2n, dim=1))
        return self.output(nd.concat(y1, center_y, dim=1))
def classify(net,image):
    """Classify ``image`` with ``net`` and return ``(action, confidence)``.

    An empty ``image`` (``len(image) == 0``) returns ``("None Action", 0.0)``
    without touching the network. Otherwise the image is moved to ``gpu(0)``
    (one GPU is required), pre-processed with the ImageNet eval transform and
    scored. The predicted label comes from ``class_list`` and the confidence
    is its softmax probability. The pair is also printed.
    """
    if not len(image):
        return "None Action", 0.0

    # one gpu is necessary
    transformed_img = data.transforms.presets.imagenet.transform_eval(
        mx.nd.array(image).as_in_context(gpu(0)))
    pred = net(transformed_img)
    ind = nd.argmax(pred, axis=1).astype('int')
    action = class_list[ind.asscalar()]
    # Compute the softmax and sync to host once, then reuse the value.
    confidence = nd.softmax(pred)[0][ind].asscalar()
    print(action, confidence)
    return action, confidence
Ejemplo n.º 6
0
    def forward(self, inputs, begin_state=None): # pylint: disable=arguments-differ
        """Run the encoder stack and the mixture-of-softmaxes output layer.

        Parameters
        -----------
        inputs : NDArray
            Input tensor. ``inputs.shape[1]`` is used as the batch size, which
            matches shape `(sequence_length, batch_size)` for layout "TNC".
        begin_state : list
            Initial recurrent state, one entry per encoder layer. When falsy,
            it is created with ``self.begin_state``.

        Returns
        --------
        out: NDArray
            Log-probabilities reshaped to `(-1, batch_size, vocab_size)`.
        out_states: list
            The state returned by each encoder layer.
        encoded_raw: list
            The raw output of each encoder layer.
        encoded_dropped: list
            The outputs after dropout. Intermediate layers are included only
            when ``self._drop_h`` is truthy; the final output is always
            appended.
        """
        hidden = self.embedding(inputs)
        if not begin_state:
            begin_state = self.begin_state(batch_size=inputs.shape[1])

        out_states, encoded_raw, encoded_dropped = [], [], []
        last_layer = len(self.encoder) - 1
        for layer_idx, (layer, layer_state) in enumerate(zip(self.encoder, begin_state)):
            hidden, new_state = layer(hidden, layer_state)
            encoded_raw.append(hidden)
            out_states.append(new_state)
            # Hidden-to-hidden dropout applies between layers, not after the last.
            if self._drop_h and layer_idx != last_layer:
                hidden = nd.Dropout(hidden, p=self._drop_h, axes=(0,))
                encoded_dropped.append(hidden)
        if self._dropout:
            hidden = nd.Dropout(hidden, p=self._dropout, axes=(0,))
        encoded_dropped.append(hidden)

        # One softmax per expert, mixed with the prior weights.
        latent = nd.Dropout(self.latent(hidden), p=self._drop_l, axes=(0,))
        logit = self.decoder(latent.reshape(-1, self._embed_size))
        prior = nd.softmax(self.prior(hidden).reshape(-1, self._num_experts))
        expert_prob = nd.softmax(logit.reshape(-1, self._vocab_size)).reshape(
            -1, self._num_experts, self._vocab_size)
        mixture = expert_prob * prior.expand_dims(2).broadcast_to(expert_prob.shape)
        prob = mixture.sum(axis=1)
        # The small constant keeps the log finite for zero probabilities.
        out = nd.log(nd.add(prob, 1e-8)).reshape(-1, inputs.shape[1], self._vocab_size)
        return out, out_states, encoded_raw, encoded_dropped
Ejemplo n.º 7
0
def test_accur(target, it, *input):
    """Return top-1 accuracy and mean correct-class confidence.

    ``input`` must hold exactly two arrays, ``theta`` and ``phi``. They are
    blended as ``theta - theta / (1 + lamb) + phi / (1 + lamb)``, where
    ``lamb`` decays with the iteration counter ``it`` from ``LambdaMax``
    towards ``LambdaMin``. The blend is softmax-normalised in place and its
    top-1 prediction is compared with ``target``.

    Returns ``(sum(correct) / batch_size, sum(correct * top1_prob) / batch_size)``
    with ``batch_size = target.size``.
    """
    LambdaMin = 5.0
    LambdaMax = 1500.0
    theta, phi = input
    batch_size = target.size
    # The original also set lamb = 1500.0 first; that value was never read.
    lamb = max(LambdaMin, LambdaMax / (1 + 0.1 * it))
    # because indexing is not differentiable in mxnet, we must do this
    output = theta - theta / (1 + lamb) + phi / (1 + lamb)
    nd.softmax(output, out=output)
    v, idx = nd.topk(output, ret_typ='both')
    real = (idx == target.reshape(-1, 1).astype(idx.dtype))
    return nd.sum(real) / batch_size, nd.sum(real * v) / batch_size
    def forward(self, cur_input, state, encoder_outputs):
        """Run one decoder step with attention over the encoder outputs.

        Parameters
        ----------
        cur_input : input for the current step, passed to ``self.embedding``.
        state : recurrent state; only ``state[0][-1]`` is read.
        encoder_outputs : encoder outputs, reshaped here to
            ``(self.max_seq_len, -1, self.encoder_num_hiddens)``.

        Returns
        -------
        ``(output, state)``: the output of ``self.out`` reshaped with
        ``(-3, -1)``, and the new state returned by ``self.rnn``.
        """
        # When the RNN has several hidden layers, use only the hidden state
        # of the layer closest to the output.
        single_layer_state = [state[0][-1].expand_dims(0)]

        # encoder_outputs has shape (max_seq_len, -1, encoder_num_hiddens)
        encoder_outputs = encoder_outputs.reshape(
            (self.max_seq_len, -1, self.encoder_num_hiddens))

        hidden_broadcast = nd.broadcast_axis(single_layer_state[0],
                                             axis=0,
                                             size=self.max_seq_len)
        encoder_outputs_and_hiddens = nd.concat(encoder_outputs,
                                                hidden_broadcast,
                                                dim=2)

        energy = self.attention(encoder_outputs_and_hiddens)

        # Normalise over axis 0, then move that axis last for batch_dot.
        # The original computed this softmax twice and discarded the first.
        batch_attention = nd.softmax(energy, axis=0).transpose((1, 2, 0))
        batch_encoder_outputs = encoder_outputs.swapaxes(0, 1)
        decoder_context = nd.batch_dot(batch_attention, batch_encoder_outputs)
        # Original author's marker: "modified here".
        input_and_context = nd.concat(nd.expand_dims(self.embedding(cur_input),
                                                     axis=1),
                                      decoder_context,
                                      dim=2)
        concat_input = self.rnn_concat_input(input_and_context).reshape(
            (1, -1, 0))

        concat_input = self.dropout(concat_input)

        # Feed the same single-layer state to every RNN layer.
        state = [
            nd.broadcast_axis(single_layer_state[0],
                              axis=0,
                              size=self.num_layers)
        ]

        output, state = self.rnn(concat_input, state)

        output = self.dropout(output)
        output = self.out(output)
        output = output.reshape((-3, -1))
        return output, state
Ejemplo n.º 9
0
 def _get_co_attention(as_, bs_, r, lamb=k_lambda):
     """Compute co-attention between two sequences.

     Per the original author's notes, ``as_`` and ``bs_`` are
     (batch_size, seq_len, embed_size) and ``r`` is
     (batch_size, seq_len, seq_len, 5).

     The score matrix is ``as_ @ bs_^T + lamb * F(r, ctx)``. ``alpha`` is its
     softmax over axis 2, and ``beta`` is its softmax over axis 1, transposed.

     Returns ``(ac, bc, alpha, beta)`` with ``ac = alpha @ bs_`` and
     ``bc = beta @ as_``.
     """
     similarity = nd.batch_dot(as_, bs_, transpose_b=True)
     e = similarity + lamb * F(r, ctx)

     # alpha_ij = exp(e_ij) / SUM_k exp(e_ik)
     alpha = nd.softmax(e, axis=2)
     # beta_ij = exp(e_ij) / SUM_k exp(e_kj); transposed because the
     # normalisation ran over axis 1.
     beta = nd.transpose(nd.softmax(e, axis=1), axes=[0, 2, 1])

     ac = nd.batch_dot(alpha, bs_)
     bc = nd.batch_dot(beta, as_)
     return ac, bc, alpha, beta
Ejemplo n.º 10
0
    def forward(self, inputs, begin_state=None):
        """Implement forward computation.

        Parameters
        ----------
        inputs : NDArray
            The input batch. ``inputs.shape[1]`` is used as the batch size.
        begin_state : list
            The initial hidden states. When falsy, they are created with
            ``self.begin_state``.

        Returns
        -------
        out: NDArray
            Log-probabilities of the expert mixture, reshaped to
            ``(-1, batch_size, vocab_size)``.
        out_states: list
            The list of output states of the model's encoder.
        encoded_raw: list
            The raw output of every encoder layer.
        encoded_dropped: list
            The encoder outputs after dropout. Intermediate layers are
            included only when ``self._drop_h`` is truthy; the final output
            is always appended.
        """
        encoded = self.embedding(inputs)
        if not begin_state:
            begin_state = self.begin_state(batch_size=inputs.shape[1])
        out_states = []
        encoded_raw = []
        encoded_dropped = []
        for i, (e, s) in enumerate(zip(self.encoder, begin_state)):
            encoded, state = e(encoded, s)
            encoded_raw.append(encoded)
            out_states.append(state)
            # Inter-layer dropout is skipped after the last encoder layer.
            if self._drop_h and i != len(self.encoder) - 1:
                encoded = nd.Dropout(encoded, p=self._drop_h, axes=(0, ))
                encoded_dropped.append(encoded)
        if self._dropout:
            encoded = nd.Dropout(encoded, p=self._dropout, axes=(0, ))
        encoded_dropped.append(encoded)

        latent = nd.Dropout(self.latent(encoded), p=self._drop_l, axes=(0, ))
        logit = self.decoder(latent.reshape(-1, self._embed_size))
        prior_logit = self.prior(encoded).reshape(-1, self._num_experts)
        prior = nd.softmax(prior_logit)

        # One softmax per expert, then mix them with the prior weights.
        prob = nd.softmax(logit.reshape(-1, self._vocab_size))
        prob = prob.reshape(-1, self._num_experts, self._vocab_size)
        prob = (prob *
                prior.expand_dims(2).broadcast_to(prob.shape)).sum(axis=1)
        # The small constant keeps the log finite for zero probabilities.
        out = nd.log(nd.add(prob, 1e-8)).reshape(-1, inputs.shape[1],
                                                 self._vocab_size)

        return out, out_states, encoded_raw, encoded_dropped
Ejemplo n.º 11
0
    def msg_reduce(self, node):
        """Reduce a node's mailbox into one attention-weighted state.

        The mailbox entry ``'alpha'`` is softmax-normalised over axis 1 and
        used to weight ``'state'``. The weighted sum over axis 1 goes through
        ReLU and is returned under the key ``'new_state'``.
        """
        incoming_state = node.mailbox['state']
        weights = nd.softmax(node.mailbox['alpha'], axis=1)
        aggregated = nd.sum(weights * incoming_state, axis=1)
        return {'new_state': nd.relu(aggregated)}
Ejemplo n.º 12
0
 def route(self, x):
     """Run ``self.route_num`` routing iterations over ``x``.

     Routing logits start at zero with shape
     ``(x.shape[0], self.num_cap_in, self.num_cap, 1, x.shape[4], x.shape[5])``.
     Every iteration except the last works on a gradient-stopped copy of
     ``x``; only the last weighted sum uses ``x`` itself.

     Returns the squashed output ``v`` of the last iteration.
     """
     logits = nd.zeros((x.shape[0], self.num_cap_in, self.num_cap, 1,
                        x.shape[4], x.shape[5]),
                       ctx=x.context)
     u = x
     u_detached = nd.stop_gradient(x)
     for step in range(self.route_num):
         is_last = step == self.route_num - 1
         coupling = nd.softmax(logits, axis=2)
         source = u if is_last else u_detached
         s = nd.sum(source * coupling, axis=1)
         v = squash(s, 2)
         v1 = nd.expand_dims(v, axis=1)
         if not is_last:
             # Accumulate the u/v product (summed over axis 3) into the logits.
             logits = logits + nd.sum(u_detached * v1, axis=3, keepdims=True)
     return v
Ejemplo n.º 13
0
    def calculation(self, input_str, char_indices, indices_char, input_digits = 9, lchars = 14, ctx = mx.cpu()):
        """Greedily decode an output string for ``input_str``.

        Parameters
        ----------
        input_str : str
            Raw input; it is wrapped as ``'S' + input_str + 'E'`` before being
            one-hot encoded. The wrapped string must fit in ``input_digits``.
        char_indices : mapping from character to index.
        indices_char : mapping from index to character.
        input_digits : int
            Size of the time axis of the encoder input.
        lchars : int
            Size of the one-hot axis of the encoder input.
        ctx : context on which the input arrays are allocated.

        Returns
        -------
        str
            The decoded characters with ``'E'`` and then whitespace stripped
            from both ends. Decoding stops at the first ``' '`` or ``'E'`` or
            after ``self.out_seq_len`` steps; with ``self.out_seq_len == 0``
            the result is ``''``.
        """
        input_str = 'S' + input_str + 'E'
        X = nd.zeros((1, input_digits, lchars), ctx = ctx)
        for t, char in enumerate(input_str):
            X[0, t, char_indices[char]] = 1
        Y_init = nd.zeros((1, lchars), ctx = ctx)
        Y_init[0, char_indices['S']] = 1
        begin_state = self.encoder.begin_state(batch_size = 1, ctx = ctx)
        _, (h, c) = self.encoder(X, begin_state)
        # Index 1 of the returned states seeds the decoder; presumably the
        # second encoder layer. TODO confirm.
        next_h = h[1]
        next_c = c[1]
        deout = Y_init

        # Start empty so the return is defined even for zero decode steps.
        ret_seq = ''
        for _ in range(self.out_seq_len):
            deout, (next_h, next_c) = self.decoder(deout, [next_h, next_c])
            deout = nd.expand_dims(deout, axis = 1)
            deout = self.batchnorm(deout)
            deout = deout[:, 0, :]

            deout_sm = self.dense(deout)
            # Softmax is monotonic, so argmax on the raw scores picks the same
            # class. One argmax drives both the fed-back token and the output.
            best = nd.argmax(deout_sm, axis = 1)
            deout = nd.one_hot(best, depth = self.vocab_size)
            ret_seq += indices_char[int(best.asnumpy()[0])]

            if ret_seq[-1] == ' ' or ret_seq[-1] == 'E':
                break
        return ret_seq.strip('E').strip()
Ejemplo n.º 14
0
def dot_attention(query, key, value, mask, dropout=0.0):
    """Scaled dot-product attention over multi-head inputs.

    Per the original author's notes:
      query: (batch_size, h, length_q, model_dim/h)
      key:   (batch_size, h, length_k, model_dim/h)
      value: (batch_size, h, length_k, model_dim/h)

    ``mask`` must broadcast against the score tensor. Positions where it is
    zero are replaced by ``LARGE_NEGATIVE_VALUE`` before the softmax, which
    hides padding and future words. ``dropout`` is applied to the attention
    weights when it is greater than 0.

    Returns the attended values reshaped to the original shape of ``query``.
    """
    query_shape = query.shape
    # Merge the first two axes (batch and heads) so batch_dot can be used.
    query = query.reshape(-3, -2)
    key = key.reshape(-3, -2)
    value = value.reshape(-3, -2)

    # matmul, t: (batch_size*h, length_q, length_k)
    t = nd.batch_dot(query, key.swapaxes(1, 2)) / math.sqrt(query.shape[-1])

    # mask PAD and future words
    # Allocate the helpers on t's context; the default context would not
    # match when the inputs live on a GPU.
    m = nd.full(t.shape, LARGE_NEGATIVE_VALUE, ctx=t.context, dtype=t.dtype)
    mask = nd.ones(t.shape, ctx=t.context) * mask
    t = nd.where(mask, t, m)

    # softmax
    t = nd.softmax(t, axis=-1)
    if dropout > 0.0:
        t = nd.dropout(t, p=dropout)

    # (batch_size, h, length_q, model_dim/h)
    return nd.batch_dot(t, value).reshape(query_shape)
Ejemplo n.º 15
0
 def Route(self, x):
     """Run ``self.route_num`` routing iterations and return the squashed output.

     ``x`` is expanded and repeated along two new axes (``self.num_cap`` and
     ``self.units`` times), multiplied by ``self.w_ij`` and summed over
     axis 1 to give ``u``. Routing logits start at zero. Every iteration
     except the last uses a gradient-stopped copy of ``u``.
     """
     logits = nd.zeros((x.shape[0], 1, self.num_cap, self.num_locations),
                       ctx=x.context)

     tiled = nd.repeat(nd.expand_dims(x, 2), repeats=self.num_cap, axis=2)
     tiled = nd.repeat(nd.expand_dims(tiled, axis=2),
                       repeats=self.units,
                       axis=2)
     weights = nd.expand_dims(self.w_ij.data(), axis=0)
     u = nd.sum(weights * tiled, axis=1)
     u_detached = nd.stop_gradient(u)

     for step in range(self.route_num):
         is_last = step == self.route_num - 1
         coupling = nd.softmax(logits, axis=2)
         source = u if is_last else u_detached
         s = nd.sum(source * coupling, axis=-1)
         v = squash(s, 1)
         v1 = nd.expand_dims(v, axis=-1)
         if not is_last:
             # Accumulate the u/v product (summed over axis 1) into the logits.
             logits = logits + nd.sum(u_detached * v1, axis=1, keepdims=True)
     return v
Ejemplo n.º 16
0
def mxnet_cifar10(im):
    """Classify the image file ``im`` with a pretrained CIFAR-10 ResNet-110.

    The image is resized and centre-cropped to 32, converted to a tensor and
    normalised. Returns a two-element list of strings: the predicted class
    name and its softmax probability rounded to two decimals.
    """
    raw = image.imread(im)

    # transform image
    preprocess = transforms.Compose([
        transforms.Resize(32),
        transforms.CenterCrop(32),
        transforms.ToTensor(),
        transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
    ])
    tensor = preprocess(raw)

    # load pre-trained model (on every call)
    net = get_model('cifar_resnet110_v1', classes=10, pretrained=True)

    # predict class
    pred = net(tensor.expand_dims(axis=0))

    class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']
    ind = nd.argmax(pred, axis=1).astype('int')
    label = str(class_names[ind.asscalar()])
    confidence = str(round(nd.softmax(pred)[0][ind].asscalar(), 2))
    return [label, confidence]
Ejemplo n.º 17
0
def predict_all(X, net, ctx, dfeat, batch_size=64, cnn_flag=False):
    '''
    Predict every example in X with net, batch by batch.

    :param X: an ndarray containing the data. The first axis is over examples
    :param net: trained model
    :param ctx: context each batch is moved to before the forward pass
    :param dfeat: the dimensionality of the vectorized feature
    :param batch_size: batch size used by the iterator. default is 64.
    :param cnn_flag: when False, each batch is reshaped to (-1, dfeat) first
    :return: Two ndarrays: the hard (argmax) predictions, then the soft
        (softmax) predictions, both truncated to X.shape[0] rows.
    '''

    data_iterator = mx.io.NDArrayIter(X, None, batch_size, shuffle=False)
    soft_batches = []
    hard_batches = []

    for batch in data_iterator:
        data = batch.data[0].as_in_context(ctx)
        if not cnn_flag:
            data = data.reshape((-1, dfeat))
        output = net(data)
        soft_batches.append(nd.softmax(output, axis=1))
        hard_batches.append(nd.argmax(output, axis=1))

    ypred_soft_all = nd.concatenate(soft_batches, axis=0)
    ypred_all = nd.concatenate(hard_batches, axis=0)

    # The iterator pads the last minibatch, so drop the surplus rows.
    n_examples = X.shape[0]
    return ypred_all[:n_examples], ypred_soft_all[:n_examples]
Ejemplo n.º 18
0
 def forward(self, x, _mask=None):
     """Pool ``x`` along ``self.axis`` with learned softmax weights.

     Scores come from ``self.linear2(self.linear1(x))`` and are normalised
     over ``self.axis``. The result is the weighted sum of ``x`` over that
     axis. ``_mask`` is accepted but not used.
     """
     scores = self.linear2(self.linear1(x))
     weights = nd.softmax(scores, axis=self.axis)
     return (weights * x).sum(axis=self.axis)
Ejemplo n.º 19
0
def targetClassify(model_name,input_pic,target_class):
    """Print the probability that an image belongs to ``target_class``.

    Loads the pretrained model ``model_name`` and finds the first class whose
    lower-cased name equals ``target_class``. It then scores
    ``"images/" + input_pic``. If the class is not found, an error line is
    printed and the function returns without scoring. Always returns None.
    """
    # Load the specified model; assume pretrained weights exist.
    net = get_model(model_name, pretrained=True)
    classes = net.classes

    # Index of the first matching class, or -1 when there is none.
    classInd = next(
        (i for i, name in enumerate(classes) if target_class == name.lower()),
        -1)
    if classInd == -1:
        print("ERROR: Target class not found in this model : %s" % target_class)
        return

    # All input images are expected under the "images/" directory.
    img = transform_eval(image.imread("images/" + input_pic))
    pred = net(img)

    # Softmax turns the raw scores into probabilities.
    prob = nd.softmax(pred)[0][classInd].asscalar()
    print("Probability of class [%s] for [%s]: %.3f" % (classes[classInd],input_pic,prob))
    
Ejemplo n.º 20
0
def predictor_0(neural_network, features_npy, use_softmax=False):
    """Run ``neural_network`` on ``features_npy`` and return a NumPy array.

    The features are converted with ``nd.array`` first. When ``use_softmax``
    is truthy the network output is passed through ``nd.softmax`` before the
    conversion back to NumPy.
    """
    scores = neural_network(nd.array(features_npy))
    if use_softmax:
        scores = nd.softmax(scores)
    return scores.asnumpy()
Ejemplo n.º 21
0
    def forward(self, x, x_mask=None):
        """Directional self-attention with a fusion gate.

        ``x`` must be 3-D; the original author labels its axes
        (bs, sl, vec). ``x_mask`` is accepted but currently unused: only the
        directional mask from ``get_direct_mask`` is applied.

        Returns ``gate * x_map + (1 - gate) * attn_result``, where ``x_map``
        is ``self.linear1(x)`` and the gate is a sigmoid over both terms.
        """
        # The original also unpacked the shape into N, T, D, never used.
        bs, sl, _ = tuple(x.shape)  # bs, sl, vec
        direct_mask = get_direct_mask(bs, sl, self.direction)
        mask = direct_mask.astype('float32')
        x_map = self.linear1(x)  # bs, sl, vec
        x_map_tile = nd.tile(x_map.expand_dims(1),
                             (1, sl, 1, 1))  # bs, sl, sl, vec
        x_map_drop = self.dropout(x_map)

        # Pairwise logits: the two projections broadcast along different axes.
        dependent = self.linear2(x_map_drop)
        dependent_etd = dependent.expand_dims(1)
        head = self.linear3(x_map_drop)
        head_etd = head.expand_dims(2)
        loggits = scaled_tanh(dependent_etd + head_etd + self.f_bias, 5.0)

        # Mask before the softmax, then mask the scores again afterwards.
        loggits_masked = exp_mask_for_tensor(loggits, mask)
        attn_score = nd.softmax(loggits_masked, 2)
        attn_score = mask_for_tensor(attn_score, mask)

        attn_result = (attn_score * x_map_tile).nansum(2)
        fusion_gate = nd.sigmoid(
            self.linear4(x_map) + self.linear5(attn_result) + self.o_bias)
        output = fusion_gate * x_map + (1 - fusion_gate) * attn_result
        return output
Ejemplo n.º 22
0
 def forward(self, x):
     """Causal self-attention over ``x``; returns ``x`` joined with the read.

     Keys, queries and values come from the three projection layers. Each
     position may attend only to itself and earlier positions. The attention
     read-out is concatenated to ``x`` along axis 2.

     NOTE(review): the original built its mask with two consecutive
     ``np.place`` calls that left every entry at 1.0, so nothing was masked.
     It also multiplied the logits by the mask, which would not hide a
     position from the softmax even with a correct mask. Future positions
     now get a large negative additive penalty instead. Confirm that causal
     masking was the intent.
     """
     with x.context:
         keys = self.key_layer(x)
         queries = self.query_layer(x)
         values = self.value_layer(x)
         logits = nd.linalg_gemm2(queries, keys.swapaxes(2, 1))
         if self.show_shape:
             print("keys shape:{}".format(keys.shape))
             print("queries shape:{}".format(queries.shape))
             print("logits shape:{}".format(logits.shape))
         # Generate masking part: 1 strictly above the diagonal marks the
         # future positions a query must not see.
         future = np.triu(
             np.ones((logits.shape[1], logits.shape[2]), dtype='float32'), 1)
         # A finite large negative avoids NaNs; it broadcasts over the batch.
         penalty = nd.array(np.expand_dims(future * -1e9, 0))
         logits = nd.broadcast_add(logits, penalty)
         probs = nd.softmax(logits / self.sqrt_k, axis=2)
         if self.show_shape:
             print("probs shape:{}".format(probs.shape))
             print("values shape:{}".format(values.shape))
         read = nd.linalg_gemm2(probs, values)
         concat_data = nd.concat(x, read, dim=2)
         return concat_data
Ejemplo n.º 23
0
def dev(ch_bert, model, ch_vocab, dev_dataiter, logger, ctx):
    """Evaluate ``model`` on ``dev_dataiter`` and return an F-score.

    Each batch yields ``(content, token_types, valid_len, label, example_id)``.
    Predictions are the argmax of the model output over the last axis.
    ``ch_bert`` and ``ch_vocab`` are accepted but not used here.

    The score is logged through ``logger`` and returned. Precision, recall
    and F fall back to 0.0 when their denominator is zero, for example on an
    empty iterator, instead of raising ZeroDivisionError.

    NOTE(review): the "TP" counter counts every position where the
    prediction equals the label, including label 0. Confirm this is the
    intended definition.
    """
    TP_s = 0
    FP_s = 0
    FN_s = 0
    for content, token_types, valid_len, label, _ in tqdm(dev_dataiter):
        content = content.as_in_context(ctx)
        token_types = token_types.as_in_context(ctx)
        valid_len = valid_len.as_in_context(ctx)
        label = label.as_in_context(ctx)

        output = model(content, token_types, valid_len)
        predict = nd.argmax(nd.softmax(output, axis=-1), axis=-1)
        wrong = nd.not_equal(predict, label)
        TP_s += int(nd.sum(nd.equal(predict, label)).asscalar())
        # Wrong predictions are split by their gold label.
        FP_s += int(nd.sum(wrong * nd.equal(label, 0)).asscalar())
        FN_s += int(nd.sum(wrong * nd.equal(label, 1)).asscalar())

    P_s = TP_s / (TP_s + FP_s) if (TP_s + FP_s) else 0.0
    R_s = TP_s / (TP_s + FN_s) if (TP_s + FN_s) else 0.0
    F = (2 * P_s * R_s) / (P_s + R_s) if (P_s + R_s) else 0.0

    logger.info("F:{}".format(F))
    return F
Ejemplo n.º 24
0
    def _predict_tabular_data(self, new_data, process=True, predict_proba=True):  # TODO ensure API lines up with tabular.Model class.
        """ Produce predictions on new (unprocessed) data with the TabularNN model.

            Returns a 1D numpy array for regression, for class-label output
            (predict_proba=False) and for binary class-probabilities, where
            only column 1 is returned. Returns a 2D array of
            class-probabilities otherwise.
            Args:
                new_data (pd.Dataframe or TabularNNDataset): new data to make predictions on.
                To predict a single row of new_data, pass in: new_data.iloc[[row_index]]
                process (bool): should new data be processed (if False, new_data must be TabularNNDataset)
                predict_proba (bool): should we output class-probabilities (not used for regression)
            Raises:
                ValueError: if new_data is not a TabularNNDataset after optional processing.
        """
        if process:
            new_data = self.process_test_data(new_data, batch_size=self.batch_size, num_dataloading_workers=self.num_dataloading_workers_inference, labels=None)
        if not isinstance(new_data, TabularNNDataset):
            raise ValueError("new_data must of of type TabularNNDataset if process=False")

        is_regression = self.problem_type == REGRESSION
        single_column = is_regression or not predict_proba
        num_columns = 1 if single_column else self.num_net_outputs
        preds = nd.zeros((new_data.num_examples, num_columns))

        offset = 0
        for data_batch in new_data.dataloader:
            data_batch = new_data.format_batch_data(data_batch, self.ctx)
            preds_batch = self.model(data_batch)
            batch_size = len(preds_batch)
            if not is_regression:
                if predict_proba:
                    # class-probabilities
                    preds_batch = nd.softmax(preds_batch, axis=1)
                else:
                    # hard class labels
                    preds_batch = nd.argmax(preds_batch, axis=1, keepdims=True)
            preds[offset:(offset + batch_size)] = preds_batch
            offset += batch_size

        if single_column:
            return preds.asnumpy().flatten()  # 1D numpy array
        if self.problem_type == BINARY:
            return preds[:,1].asnumpy()  # for binary problems, only return P(Y==+1)
        return preds.asnumpy()  # 2D numpy array
Ejemplo n.º 25
0
 def act(self, stochastic, input_):
     """Pick an action for ``input_`` and return ``(action, value)``.

     ``self.forward`` supplies ``(value, logits)``. When ``stochastic`` is
     truthy the action is sampled from the softmax of the logits; otherwise
     it is the argmax over the last axis, cast to int32.
     """
     value, logits = self.forward(input_)
     if not stochastic:
         return nd.argmax(logits, axis=-1).astype('int32'), value
     return nd.sample_multinomial(nd.softmax(logits)), value
Ejemplo n.º 26
0
 def __call__(self, output, label):
     """Return the one-hot encoding of the least likely non-true class.

     The softmax of ``output`` is taken and the entry at each row's
     ``label`` is overwritten with 1. The per-row argmin is then one-hot
     encoded with ``self.num_class`` classes as float32.
     """
     probs = nd.softmax(output).asnumpy()
     true_idx = label.asnumpy().astype('int').reshape((-1, ))
     # Overwrite the true class with 1 so the argmin lands on another class.
     probs[range(true_idx.shape[0]), true_idx] = 1
     least_likely = nd.array(probs).argmin(axis=1)
     return least_likely.one_hot(self.num_class).astype('float32')
Ejemplo n.º 27
0
    def forward(self, query, values, head=False):
        """
        Compute the attention weights and the attention output vector.

        :param query: the query, i.e. the decoder input at the current step
        :param values: the values, i.e. the encoder vector at every time step
        :param head: when exactly True, the weighted values are summed over
            axis 2 instead of axis 1
        :return: (attention output vector, attention weights)
        """
        # Insert an axis at position 1 before projecting the query.
        query_with_time = nd.expand_dims(query, 1)
        energy = nd.tanh(self.W1(values) + self.W2(query_with_time))
        attention_weights = nd.softmax(self.V(energy), axis=1)

        weighted_values = attention_weights * values
        reduce_axis = 2 if head is True else 1
        context_vector = nd.sum(weighted_values, axis=reduce_axis)
        # Add a leading axis to the result.
        context_vector = nd.expand_dims(context_vector, axis=0)
        return context_vector, attention_weights
Ejemplo n.º 28
0
 def forward(self, am, bm, alpha_r, beta_r):
     """Compose both sequences, pool them three ways and classify.

     ``am`` and ``bm`` go through their inference-composition layers; the
     original author notes the result as (batch_size, seq_len, hidden*2).
     Each side is reduced over axis 1 by max, mean and a weighted sum. The
     weights are the softmax of a dense layer applied to ``alpha_r`` or
     ``beta_r``. The six pooled tensors are concatenated and passed to
     ``self.final_mlp``.

     NOTE(review): the weight softmax uses the default axis while the
     pooling sums over axis 1. Confirm the normalisation axis is intended.
     """
     av = self.inference_composition_a(am)
     bv = self.inference_composition_b(bm)
     weight_a = nd.softmax(self.weight_pooling_dense_a(alpha_r))
     weight_b = nd.softmax(self.weight_pooling_dense_b(beta_r))

     pooled = [
         nd.max(av, axis=1),
         nd.mean(av, axis=1),
         nd.sum(weight_a * av, axis=1),
         nd.max(bv, axis=1),
         nd.mean(bv, axis=1),
         nd.sum(weight_b * bv, axis=1),
     ]
     return self.final_mlp(nd.concat(*pooled))
Ejemplo n.º 29
0
def beam_search_translate(encoder, decoder, input_seq, max_length, ctx,
                          beam_size, in_vocab, out_vocab):
    """Translate ``input_seq`` by branching on the first output token.

    The input is lower-cased, split on spaces, terminated with ``EOS`` and
    padded with ``PAD`` up to ``max_length``. After one decoder step, each of
    the ``beam_size`` most likely first tokens is completed with
    ``predict_rest``.

    Returns the list of results from ``predict_rest``. Element 1 of each
    result is replaced in place by ``log(score) / len(tokens) ** 0.75``,
    where ``tokens`` is element 0.
    """
    in_tokens = input_seq.lower().split(' ')
    in_tokens += [EOS] + [PAD] * (max_length - len(in_tokens) - 1)
    enc_input = nd.array([in_vocab.to_indices(in_tokens)], ctx=ctx)
    enc_state = encoder.begin_state(batch_size=1, ctx=ctx)
    enc_output, enc_state = encoder(enc_input, enc_state)
    dec_input = nd.array([out_vocab.token_to_idx[BOS]], ctx=ctx)
    dec_state = decoder.begin_state(enc_state)
    output_tokens = []
    # the first character prediction
    dec_output, dec_state = decoder(dec_input, dec_state, enc_output)
    topk = nd.topk(dec_output, k=beam_size,
                   ret_typ='indices').asnumpy().astype('int32')
    # The distribution is the same for every beam, so compute it once.
    first_step_probs = nd.softmax(dec_output[0])
    for idx in topk[0]:
        score = first_step_probs[idx].asscalar()
        sample_output = predict_rest(encoder, decoder, input_seq, max_length,
                                     idx, dec_state, enc_output, score,
                                     in_vocab, out_vocab, ctx)
        output_tokens.append(sample_output)

    # Length-normalise the log score of every hypothesis in place.
    for hypothesis in output_tokens:
        hypothesis[1] = math.log(hypothesis[1]) / (len(hypothesis[0])**0.75)
    return output_tokens
Ejemplo n.º 30
0
 def forward(self, a):
     """Self-attend over the projected sequence ``a``.

     ``a`` must be 3-D. It is flattened over its first two axes, projected
     by ``self.f`` and reshaped to ``(batch, length, self.hidden_size)``.
     The scores are the product of the projection with its own transpose,
     softmax-normalised on the default axis, and applied to the projection.
     """
     batch, length, width = a.shape
     flat = a.reshape(batch * length, width)
     tilde_a = self.f(flat).reshape(batch, length, self.hidden_size)
     scores = nd.linalg.gemm2(A=tilde_a, B=tilde_a.transpose([0, 2, 1]))
     return nd.linalg.gemm2(nd.softmax(scores), tilde_a)
Ejemplo n.º 31
0
def get_inception_score(images, splits=10, batch_size=4):
    """
    Inception_score function.
        The predictions are divided into 'splits' parts, an inception score is
        calculated for each part separately, and the mean and std of those
        per-part scores are returned.
    :param images: Images(num x c x w x h) to score; the channel axis must be 3.
        Images whose spatial size is not 299 x 299 are passed through ``resize`` first.
    :param splits: number of parts the predictions are divided into.
    :param batch_size: number of images fed to the inception model per forward
        pass (default 4, the value that used to be hard-coded).
    :return: mean and std of inception_score
    """
    assert (images.shape[1] == 3)

    # load inception model lazily on first use
    if inception_model is None:
        _init_inception()

    # resize images to adapt inception model(inceptionV3); check BOTH spatial
    # axes so a non-square input with one side already 299 is still resized.
    if images.shape[2] != 299 or images.shape[3] != 299:
        images = resize(images, 299, 299)

    num_images = images.shape[0]
    preds = []

    # collect the softmax predictions of the inception model batch by batch
    for start in range(0, num_images, batch_size):
        # one dot per batch as a progress indicator
        sys.stdout.write(".")
        sys.stdout.flush()
        inps = images[start:min(start + batch_size, num_images)]
        pred = nd.softmax(inception_model(inps))
        preds.append(pred.asnumpy())

    # list to array
    preds = np.concatenate(preds, 0)
    total = preds.shape[0]
    scores = []

    # calculate the inception_score of each split
    for i in range(splits):
        part = preds[(i * total // splits):((i + 1) * total // splits), :]
        # per-sample KL divergence between p(y|x) and the split's marginal p(y)
        # NOTE(review): an exact 0 in `part` yields nan here (0 * -inf) -- confirm
        # whether the model's softmax output can underflow.
        kl = part * (np.log(part) - np.log(np.expand_dims(np.mean(part, 0), 0)))
        kl = np.mean(np.sum(kl, 1))
        scores.append(np.exp(kl))

    return np.mean(scores), np.std(scores)
Ejemplo n.º 32
0
    def train(ctx):
        """Train ``net``, validating after every epoch and saving checkpoints.

        Apart from ``ctx``, everything used here (``opt``, ``net``, the data
        iterators, metric, logger, optimizer settings, ``test`` ...) is read from
        the enclosing scope.

        Parameters
        ----------
        ctx : mx.Context or list of mx.Context
            Device(s) to train on; a single context is wrapped into a list.
        """
        if isinstance(ctx, mx.Context):
            ctx = [ctx]
        # Compare strings by value: `is ''` tests object identity, which depends
        # on interning and raises a SyntaxWarning on Python 3.8+.
        if opt.resume_params == '':
            net.initialize(mx.init.MSRAPrelu(), ctx=ctx)

        if opt.no_wd:
            # Exclude the parameters matching this pattern from weight decay.
            for k, v in net.collect_params('.*beta|.*gamma|.*bias').items():
                v.wd_mult = 0.0

        trainer = gluon.Trainer(net.collect_params(), optimizer, optimizer_params)
        if opt.resume_states != '':
            trainer.load_states(opt.resume_states)

        # Label smoothing and mixup both produce dense label vectors, so the
        # loss must not expect sparse class indices in those modes.
        sparse_label_loss = not (opt.label_smoothing or opt.mixup)
        if distillation:
            L = gcv.loss.DistillationSoftmaxCrossEntropyLoss(temperature=opt.temperature,
                                                             hard_weight=opt.hard_weight,
                                                             sparse_label=sparse_label_loss)
        else:
            L = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=sparse_label_loss)

        # Best (lowest) top-1 validation error seen so far.
        best_val_score = 1

        for epoch in range(opt.resume_epoch, opt.num_epochs):
            tic = time.time()
            if opt.use_rec:
                train_data.reset()
            train_metric.reset()
            btic = time.time()

            for i, batch in enumerate(train_data):
                data, label = batch_fn(batch, ctx)

                if opt.mixup:
                    lam = np.random.beta(opt.mixup_alpha, opt.mixup_alpha)
                    # Mixup is switched off for the last `mixup_off_epoch` epochs.
                    if epoch >= opt.num_epochs - opt.mixup_off_epoch:
                        lam = 1
                    # Blend each per-device batch with its own reversed copy.
                    data = [lam*X + (1-lam)*X[::-1] for X in data]

                    eta = 0.1 if opt.label_smoothing else 0.0
                    label = mixup_transform(label, classes, lam, eta)

                elif opt.label_smoothing:
                    # Keep the original labels for the training metric.
                    hard_label = label
                    label = smooth(label, classes)

                if distillation:
                    # Teacher predictions are computed outside the autograd scope.
                    teacher_prob = [nd.softmax(teacher(X.astype(opt.dtype, copy=False)) / opt.temperature)
                                    for X in data]

                with ag.record():
                    outputs = [net(X.astype(opt.dtype, copy=False)) for X in data]
                    if distillation:
                        loss = [L(yhat.astype('float32', copy=False),
                                  y.astype('float32', copy=False),
                                  p.astype('float32', copy=False)) for yhat, y, p in zip(outputs, label, teacher_prob)]
                    else:
                        loss = [L(yhat, y.astype(opt.dtype, copy=False)) for yhat, y in zip(outputs, label)]
                for l in loss:
                    l.backward()
                trainer.step(batch_size)

                if opt.mixup:
                    output_softmax = [nd.SoftmaxActivation(out.astype('float32', copy=False))
                                      for out in outputs]
                    train_metric.update(label, output_softmax)
                elif opt.label_smoothing:
                    train_metric.update(hard_label, outputs)
                else:
                    train_metric.update(label, outputs)

                if opt.log_interval and not (i+1)%opt.log_interval:
                    train_metric_name, train_metric_score = train_metric.get()
                    logger.info('Epoch[%d] Batch [%d]\tSpeed: %f samples/sec\t%s=%f\tlr=%f'%(
                                epoch, i, batch_size*opt.log_interval/(time.time()-btic),
                                train_metric_name, train_metric_score, trainer.learning_rate))
                    btic = time.time()

            train_metric_name, train_metric_score = train_metric.get()
            # NOTE(review): `i` is the index of the last batch, so this counts one
            # batch fewer than were processed (and `i` is unbound if train_data
            # yields nothing). Left unchanged to keep the logged numbers comparable.
            throughput = int(batch_size * i /(time.time() - tic))

            err_top1_val, err_top5_val = test(ctx, val_data)

            logger.info('[Epoch %d] training: %s=%f'%(epoch, train_metric_name, train_metric_score))
            logger.info('[Epoch %d] speed: %d samples/sec\ttime cost: %f'%(epoch, throughput, time.time()-tic))
            logger.info('[Epoch %d] validation: err-top1=%f err-top5=%f'%(epoch, err_top1_val, err_top5_val))

            # Save a "best" checkpoint whenever the top-1 validation error improves.
            if err_top1_val < best_val_score:
                best_val_score = err_top1_val
                net.save_parameters('%s/%.4f-imagenet-%s-%d-best.params'%(save_dir, best_val_score, model_name, epoch))
                trainer.save_states('%s/%.4f-imagenet-%s-%d-best.states'%(save_dir, best_val_score, model_name, epoch))

            # Periodic checkpoint every `save_frequency` epochs.
            if save_frequency and save_dir and (epoch + 1) % save_frequency == 0:
                net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, epoch))
                trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, epoch))

        # Final checkpoint once all epochs are done.
        if save_frequency and save_dir:
            net.save_parameters('%s/imagenet-%s-%d.params'%(save_dir, model_name, opt.num_epochs-1))
            trainer.save_states('%s/imagenet-%s-%d.states'%(save_dir, model_name, opt.num_epochs-1))
Ejemplo n.º 33
0
               'dog', 'frog', 'horse', 'ship', 'truck']

# Inference runs on the CPU.
context = [mx.cpu()]

# Load Model: request pretrained weights from get_model only when no
# parameter file was given; otherwise load that file below.
model_name = opt.model
pretrained = opt.saved_params == ''
kwargs = dict(classes=classes, pretrained=pretrained)
net = get_model(model_name, **kwargs)

if not pretrained:
    net.load_parameters(opt.saved_params, ctx=context)

# Load Images
img = image.imread(opt.input_pic)

# Transform: resize and centre-crop to 32, convert to a tensor, then
# normalise each channel with the given mean / std.
transform_fn = transforms.Compose([
    transforms.Resize(32),
    transforms.CenterCrop(32),
    transforms.ToTensor(),
    transforms.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
])

img = transform_fn(img)
# Add a leading batch axis before running the network.
pred = net(img.expand_dims(0))

# Report the highest-scoring class and its softmax probability.
ind = nd.argmax(pred, axis=1).astype('int')
print('The input picture is classified to be [%s], with probability %.3f.' % (
    class_names[ind.asscalar()],
    nd.softmax(pred)[0][ind].asscalar(),
))
Ejemplo n.º 34
0
    def forward(self, inputs, target, next_word_history, cache_history, begin_state=None): # pylint: disable=arguments-differ
        """Score ``target`` with the language model interpolated with a cache.

        The wrapped language model is run on ``inputs``. For each output
        position whose absolute index (history length plus offset) exceeds the
        window size, a cache distribution is built by attending over the last
        ``window`` cached hidden states and summing the matching one-hot next
        words; it is mixed with the model's softmax output as
        ``lambdas * cache + (1 - lambdas) * lm``. Earlier positions use the
        model's softmax output alone.

        Parameters
        ----------
        inputs: NDArray
            The input data
        target: NDArray
            The label
        next_word_history: NDArray or None
            One-hot rows of the next words already in memory; ``None`` starts
            an empty memory
        cache_history: NDArray or None
            The hidden states already in memory; ``None`` starts an empty memory
        begin_state:
            Initial state forwarded unchanged to the language model

        Returns
        --------
        out: NDArray
            The interpolated probability picked at ``target[idx]`` for every
            position, concatenated along axis 0 (``None`` if there were no
            positions)
        next_word_history: NDArray
            The updated next-word memory, truncated to the last ``window`` rows
        cache_history: NDArray
            The updated hidden-state memory, truncated to the last ``window`` rows
        hidden:
            The state returned by the language model
        """
        # Invoke forward as resolved past lm_model's own class in the MRO.
        parent_lm = super(self.lm_model.__class__, self.lm_model)
        output, hidden, encoder_hs, _ = parent_lm.forward(inputs, begin_state)
        encoder_h = encoder_hs[-1].reshape(-3, -2)
        output = output.reshape(-1, self._vocab_size)

        # One-hot rows for the words of this call, built once.
        new_words = nd.concat(*[nd.one_hot(t[0], self._vocab_size, on_value=1, off_value=0)
                                for t in target], dim=0)
        if next_word_history is None:
            start_idx = 0
            next_word_history = new_words
        else:
            start_idx = len(next_word_history)
            next_word_history = nd.concat(next_word_history, new_words, dim=0)

        if cache_history is None:
            cache_history = encoder_h
        else:
            cache_history = nd.concat(cache_history, encoder_h, dim=0)

        out = None
        for idx, lm_probs in enumerate(nd.softmax(output)):
            position = start_idx + idx
            joint_p = lm_probs
            if position > self._window:
                window_start = position - self._window
                valid_next_word = next_word_history[window_start:position]
                valid_cache_history = cache_history[window_start:position]
                # Attention of the current hidden state over the cached ones.
                logits = nd.dot(valid_cache_history, encoder_h[idx])
                cache_attn = nd.softmax(self._theta * logits).reshape(-1, 1)
                cache_dist = (cache_attn.broadcast_to(valid_next_word.shape)
                              * valid_next_word).sum(axis=0)
                joint_p = self._lambdas * cache_dist + (1 - self._lambdas) * lm_probs

            picked = joint_p[target[idx]]
            if out is None:
                out = picked
            else:
                out = nd.concat(out, picked, dim=0)

        # Keep only the most recent `window` entries for the next call.
        next_word_history = next_word_history[-self._window:]
        cache_history = cache_history[-self._window:]
        return out, next_word_history, cache_history, hidden
Ejemplo n.º 35
0
################################### predict ###################################
# Score the test set from two stored feature files and write the per-class
# probabilities to submission.csv.
h5f5 = h5py.File('features/test_resnet152_v1.h5', 'r')  # test features; ResNet-152 v1 going by the file name
h5f6 = h5py.File('features/test_inceptionv3.h5', 'r')  # test features; Inception v3 going by the file name
features3 = h5f5['features']
features4 = h5f6['features']

# The training folder dataset is loaded only for its class names (synsets),
# which become the CSV header columns.
train_imgs = gluon.data.vision.ImageFolderDataset( './data/train_valid_test/Images')
# Sorted test file names; they are paired with prediction rows by position below.
# NOTE(review): this assumes the feature rows were stored in the same sorted
# order -- confirm against the feature-extraction script.
ids = sorted(os.listdir('./data/train_valid_test/test/unknown'))
# Number of feature rows to score; hard-coded (presumably the test-set size -- TODO confirm).
test_count = 10357
outputs = []
for i in range(test_count):
    # Take row i from both files (slices keep the leading axis) and join them
    # along the last axis.
    features = np.concatenate([features3[i:i+1], features4[i:i+1]], axis=-1)
    predict = net(nd.array(features).as_in_context(ctx))
    output = nd.softmax(predict)
    # extend() appends each row of the (1, num_classes) result separately.
    outputs.extend(output.asnumpy())

with open('submission.csv', 'w') as f:
    # Header: 'id' followed by one column per class.
    f.write('id,' + ','.join(train_imgs.synsets) + '\n')
    for i, output in zip(ids, outputs):
        # The id is the file name up to its first '.'.
        f.write(i.split('.')[0] + ',' + ','.join([str(num) for num in output]) + '\n')

'''
#################################### train ####################################
#zong shuju  20580
h5f = h5py.File('features/train_resnet152_v1.h5', 'r')  # train_resnet152_v1  train_inceptionv31
h5f2 = h5py.File('features/train_inceptionv31.h5', 'r')  # train_resnet152_v1  train_inceptionv31
h5f3 = h5py.File('features/labels1.h5', 'r') 
Ejemplo n.º 36
0
    def forward(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None):
        """Run the parser: compute losses/accuracies and/or decode arcs and relations.

        Whether the call is treated as training is taken from
        ``autograd.is_training()``, not from an argument.

        Parameters
        ----------
        word_inputs : array-like
            word indices of seq_len x batch_size. NumPy functions (``np.greater``,
            ``np.where``) are applied to it directly, so presumably a
            ``numpy.ndarray`` -- TODO confirm against callers.
        tag_inputs : array-like
            tag indices of seq_len x batch_size
        arc_targets : array-like, optional
            gold arc indices of seq_len x batch_size; required when training
        rel_targets : array-like, optional
            gold rel indices of seq_len x batch_size; read whenever training or
            ``arc_targets`` is given

        Returns
        -------
        tuple or list
            ``(arc_accuracy, rel_accuracy, overall_accuracy, loss)`` when training;
            ``(arc_accuracy, rel_accuracy, overall_accuracy, outputs)`` when not
            training but ``arc_targets`` is given; otherwise just ``outputs``.
            ``outputs`` is a list with one ``(arcs, rels)`` pair per sentence.
        """
        is_train = autograd.is_training()

        def flatten_numpy(ndarray):
            """Flatten an array to 1-D in Fortran (column-major) order

            Parameters
            ----------
            ndarray : numpy.ndarray
                input tensor

            Returns
            -------
            numpy.ndarray
                The 1-D, column-major flattening of the input

            """
            return np.reshape(ndarray, (-1,), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        # 1.0 where the word id is greater than the ROOT id, else 0.0.
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))  # non padding, non root token number

        if is_train or arc_targets is not None:
            # Flattened mask, used to zero out the loss on masked positions.
            mask_1D = flatten_numpy(mask)
            mask_1D_tensor = nd.array(mask_1D)

        # Word ids at or above `words_in_train` are replaced by UNK for the
        # trainable embedding; the pretrained embedding (if any) sees the raw ids.
        unked_words = np.where(word_inputs < self._vocab.words_in_train, word_inputs, self._vocab.UNK)
        word_embs = self.word_embs(nd.array(unked_words, dtype='int'))
        if self.pret_word_embs:
            word_embs = word_embs + self.pret_word_embs(nd.array(word_inputs))
        tag_embs = self.tag_embs(nd.array(tag_inputs))

        # Join word and tag embeddings along axis 2.
        emb_inputs = nd.concat(word_embs, tag_embs, dim=2)  # presumably seq_len x batch_size x emb_dim

        # Input dropout of the BiLSTM is only enabled when training.
        top_recur = biLSTM(self.f_lstm, self.b_lstm, emb_inputs, batch_size,
                           dropout_x=self.dropout_lstm_input if is_train else 0)
        top_recur = nd.Dropout(data=top_recur, axes=[0], p=self.dropout_mlp)

        # Two single-layer MLPs over the recurrent output: one for the dependent
        # role, one for the head role.
        W_dep, b_dep = self.mlp_dep_W.data(), self.mlp_dep_b.data()
        W_head, b_head = self.mlp_head_W.data(), self.mlp_head_b.data()
        dep, head = leaky_relu(nd.dot(top_recur, W_dep.T) + b_dep), leaky_relu(nd.dot(top_recur, W_head.T) + b_head)
        dep, head = nd.Dropout(data=dep, axes=[0], p=self.dropout_mlp), nd.Dropout(data=head, axes=[0],
                                                                                       p=self.dropout_mlp)
        # Move the feature axis to the front, then split it: the first
        # `mlp_arc_size` features feed arc scoring, the rest relation scoring.
        dep, head = nd.transpose(dep, axes=[2, 0, 1]), nd.transpose(head, axes=[2, 0, 1])
        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = self.arc_W.data()
        arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len, batch_size, num_outputs=1,
                              bias_x=True, bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = reshape_fortran(arc_logits, (seq_len, seq_len * batch_size))
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.argmax(0)
        # seq_len x batch_size

        if is_train or arc_targets is not None:
            # Arc accuracy and loss, both restricted to unmasked tokens.
            correct = np.equal(arc_preds.asnumpy(), arc_targets)
            arc_correct = correct.astype(np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            # targets_1D holds the gold ARCS here; it is reassigned to the gold
            # relations further down.
            targets_1D = flatten_numpy(arc_targets)
            losses = self.softmax_loss(flat_arc_logits, nd.array(targets_1D))
            arc_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            # Head probabilities per dependent, needed for decoding below.
            arc_probs = np.transpose(
                np.reshape(nd.softmax(flat_arc_logits, axis=0).asnumpy(), (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

        W_rel = self.rel_W.data()
        rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len, batch_size,
                              num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = reshape_fortran(rel_logits, (seq_len, self._vocab.rel_size, seq_len * batch_size))
        # (#head x rel_size) x (#dep x batch_size)

        # Select relation logits at one head per dependent: the gold head when
        # training (targets_1D still holds the gold arcs at this point),
        # otherwise the predicted head.
        _target_vec = nd.array(targets_1D if is_train else flatten_numpy(arc_preds.asnumpy())).reshape(
            seq_len * batch_size, 1)
        # Repeat the chosen head index across all relation classes for nd.pick.
        _target_mat = _target_vec * nd.ones((1, self._vocab.rel_size))

        partial_rel_logits = nd.pick(flat_rel_logits, _target_mat.T, axis=0)
        # (rel_size) x (#dep x batch_size)

        if is_train or arc_targets is not None:
            # Relation accuracy and loss, both restricted to unmasked tokens.
            rel_preds = partial_rel_logits.argmax(0)
            # From here on targets_1D holds the gold RELATIONS.
            targets_1D = flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds.asnumpy(), targets_1D).astype(np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = self.softmax_loss(partial_rel_logits, nd.array(targets_1D))
            rel_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            # Relation probabilities for every (dependent, head) pair, needed for decoding.
            rel_probs = np.transpose(np.reshape(nd.softmax(flat_rel_logits.transpose([1, 0, 2]), axis=0).asnumpy(),
                                                (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

        if is_train or arc_targets is not None:
            loss = arc_loss + rel_loss
            # A token counts as fully correct only if both arc and relation are right.
            correct = rel_correct * flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if is_train:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
            # parse sentences one by one
            # Position 0 is switched on in the mask so it counts towards the
            # sentence length (presumably the ROOT slot -- TODO confirm).
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            # Keep, for each dependent, the relation probabilities at its predicted head.
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            # Drop position 0 and everything at or beyond sent_len.
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs