Example #1
def simple_rnn(inputs, state, temperature=1.0):
    outputs = []
    h = state
    for X in inputs:
        h_linear = nd.dot(X, Wxh) + nd.dot(h, Whh) + bh
        h = nd.tanh(h_linear)
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)
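The weights Wxh, Whh, bh, Why, by and the softmax helper are assumed to live in the enclosing scope of Example #1. A minimal, hypothetical setup for driving the function (sizes, initialization, and the softmax definition are illustrative, not from the original notebook):

from mxnet import nd

vocab_size, num_hidden, batch_size = 28, 256, 2   # illustrative sizes

def softmax(y_linear, temperature=1.0):
    # temperature-scaled softmax over each row
    lin = (y_linear - nd.max(y_linear)) / temperature
    exp = nd.exp(lin)
    return exp / nd.sum(exp, axis=0, exclude=True).reshape((-1, 1))

Wxh = nd.random.normal(scale=0.01, shape=(vocab_size, num_hidden))
Whh = nd.random.normal(scale=0.01, shape=(num_hidden, num_hidden))
bh = nd.zeros(num_hidden)
Why = nd.random.normal(scale=0.01, shape=(num_hidden, vocab_size))
by = nd.zeros(vocab_size)

# three one-hot encoded time steps and an all-zero initial state
inputs = [nd.one_hot(nd.array([0, 1]), vocab_size) for _ in range(3)]
outputs, state = simple_rnn(inputs, nd.zeros((batch_size, num_hidden)))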
Example #2
def lstm(_inputs, initial_state_h, initial_state_c, *parameters):
    # _inputs: a list with length num_steps,
    # corresponding element: batch_size * input_dim matrix

    H = initial_state_h  # hidden state
    C = initial_state_c  # memory cell

    [W_xi, W_hi, b_i,
     W_xf, W_hf, b_f,
     W_xo, W_ho, b_o,
     W_xc, W_hc, b_c,
     W_hy, b_y] = parameters

    _outputs = []

    for X in _inputs:
        # compute INPUT gate from input and last/initial hidden state
        input_gate = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        # compute FORGET gate from input and last/initial hidden state
        forget_gate = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        # compute OUTPUT gate from input and last/initial hidden state
        output_gate = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        # compute memory cell candidate from input and last/initial hidden state
        memory_cell_candidate = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        # compute memory cell from last memory cell and memory cell candidate
        C = forget_gate * C + input_gate * memory_cell_candidate
        # compute hidden state from output gate and memory cell
        H = output_gate * nd.tanh(C)
        # compute output from hidden state
        Y = nd.dot(H, W_hy) + b_y
        _outputs.append(Y)

    return _outputs, H, C
Example #3
def perceptron(w,b,x,y):
    if (y * (nd.dot(w,x) + b)).asscalar() <= 0:
        w += y * x
        b += y
        return 1
    else:
        return 0
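Example #3 mutates w and b in place and returns 1 whenever an update was triggered. A sketch of a training pass over synthetic, roughly linearly separable data; everything except the perceptron function itself is made up for illustration:

from mxnet import nd

num_samples, ndim = 100, 2
w, b = nd.zeros(ndim), nd.zeros(1)

X = nd.random.normal(shape=(num_samples, ndim))
Y = nd.sign(nd.dot(X, nd.array([1.0, -2.0])) + 0.5)   # synthetic labels in {-1, +1}

updates = 0
for x, y in zip(X, Y):
    updates += perceptron(w, b, x, y)
print('perceptron updates:', updates)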
Example #4
def gru(_inputs, initial_state, *parameters):
    # _inputs: a list with length num_steps,
    # corresponding element: batch_size * input_dim matrix

    H = initial_state

    [W_xz, W_hz, b_z,
     W_xr, W_hr, b_r,
     W_xh, W_hh, b_h,
     W_hy, b_y] = parameters

    _outputs = []

    for X in _inputs:
        # compute update gate from input and last/initial hidden state
        update_gate = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        # compute reset gate from input and last/initial hidden state
        reset_gate = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        # compute candidate hidden state from input, reset gate and last/initial hidden state
        H_candidate = nd.tanh(nd.dot(X, W_xh) + reset_gate * nd.dot(H, W_hh) + b_h)
        # compute hidden state from candidate hidden state and last hidden state
        H = update_gate * H + (1 - update_gate) * H_candidate
        # compute output from hidden state
        Y = nd.dot(H, W_hy) + b_y
        _outputs.append(Y)

    return _outputs, H
Example #5
def plotscore(w, d):
    xgrid = np.arange(-3, 3, 0.02)
    ygrid = np.arange(-3, 3, 0.02)
    xx, yy = np.meshgrid(xgrid, ygrid)
    zz = nd.zeros(shape=(xgrid.size, ygrid.size, 2))
    zz[:, :, 0] = nd.array(xx)
    zz[:, :, 1] = nd.array(yy)
    vv = nd.dot(zz, w) + b
    CS = plt.contour(xgrid, ygrid, vv.asnumpy())
    plt.clabel(CS, inline=1, fontsize=10)
Example #6
    def _spectral_norm(self):
        """ spectral normalization """
        w = self.params.get('weight').data(self.ctx)
        w_mat = nd.reshape(w, [w.shape[0], -1])

        _u = self.u.data(self.ctx)
        _v = None

        for _ in range(POWER_ITERATION):
            _v = nd.L2Normalization(nd.dot(_u, w_mat))
            _u = nd.L2Normalization(nd.dot(_v, w_mat.T))

        sigma = nd.sum(nd.dot(_u, w_mat) * _v)
        if sigma == 0.:
            sigma = EPSILON

        self.params.setattr('u', _u)

        return w / sigma
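The _spectral_norm method above depends on its Gluon block (self.params, self.u, POWER_ITERATION, EPSILON). The core power iteration can be reproduced on a plain matrix; this is a sketch with made-up constants, not the original class:

from mxnet import nd

POWER_ITERATION, EPSILON = 1, 1e-8   # illustrative constants

def spectral_norm(w, u):
    # w: (out, in) weight matrix, u: (1, out) running estimate of the left singular vector
    w_mat = nd.reshape(w, [w.shape[0], -1])
    for _ in range(POWER_ITERATION):
        v = nd.L2Normalization(nd.dot(u, w_mat))     # (1, in)
        u = nd.L2Normalization(nd.dot(v, w_mat.T))   # (1, out)
    sigma = nd.sum(nd.dot(u, w_mat) * v)             # approximates the largest singular value
    sigma = nd.maximum(sigma, EPSILON)               # guard against division by zero
    return w / sigma, u

w = nd.random.normal(shape=(8, 16))
u = nd.L2Normalization(nd.random.normal(shape=(1, 8)))
w_sn, u = spectral_norm(w, u)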
Example #7
def batchwise_covariance(X, Y):
        meanx = meany = vary = n = C = 0
        for x, y in zip(X, Y):
            m = len(x)
            meanx_ = x.mean(axis=0, keepdims=True)
            meany_ = y.mean(axis=0, keepdims=True)
            dx = x - meanx_
            dy = y - meany_

            C_ = nd.dot(dx, dy, transpose_a=True)
            C += C_ + nd.dot((meanx - meanx_), (meany - meany_), transpose_a=True) * n * m / (n+m)

            vary_ = nd.sum(dy**2, axis=0)
            vary += vary_ + ((meany - meany_)**2) * n * m / (n+m)

            meanx = (n * meanx + m * meanx_) / (n+m)
            meany = (n * meany + m * meany_) / (n+m)
            n += m
        return C / n, vary / n
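Example #7 merges per-batch statistics with the usual pairwise update for sums of cross-products. A hedged sanity check against a single-pass computation on the concatenated data (batch sizes and dimensions are arbitrary):

from mxnet import nd
import numpy as np

X = [nd.random.normal(shape=(32, 4)) for _ in range(5)]
Y = [nd.random.normal(shape=(32, 3)) for _ in range(5)]
C, var_y = batchwise_covariance(X, Y)   # (4, 3) cross-covariance and variance of Y

# cross-check against computing the covariance on all samples at once
xa = np.concatenate([x.asnumpy() for x in X])
ya = np.concatenate([y.asnumpy() for y in Y])
ref = (xa - xa.mean(0)).T.dot(ya - ya.mean(0)) / len(xa)
assert np.allclose(C.asnumpy(), ref, atol=1e-4)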
Example #8
def lstm_rnn(inputs, h, c, temperature=1.0):
    outputs = []
    for X in inputs:
        g = nd.tanh(nd.dot(X, Wxg) + nd.dot(h, Whg) + bg)
        i = nd.sigmoid(nd.dot(X, Wxi) + nd.dot(h, Whi) + bi)
        f = nd.sigmoid(nd.dot(X, Wxf) + nd.dot(h, Whf) + bf)
        o = nd.sigmoid(nd.dot(X, Wxo) + nd.dot(h, Who) + bo)
        #######################
        # update the memory cell and the hidden state
        #######################
        c = f * c + i * g
        h = o * nd.tanh(c)
        #######################
        # compute the output distribution from the new hidden state
        #######################
        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h, c)
Example #9
def gru_rnn(inputs, h, temperature=1.0):
    outputs = []
    for X in inputs:
        z = nd.sigmoid(nd.dot(X, Wxz) + nd.dot(h, Whz) + bz)
        r = nd.sigmoid(nd.dot(X, Wxr) + nd.dot(h, Whr) + br)
        g = nd.tanh(nd.dot(X, Wxh) + nd.dot(r * h, Whh) + bh)
        h = z * h + (1 - z) * g

        yhat_linear = nd.dot(h, Why) + by
        yhat = softmax(yhat_linear, temperature=temperature)
        outputs.append(yhat)
    return (outputs, h)
Example #10
def cov_reg_it(self, x, y, mean_x, mean_xy, num_x, num_y, regime):
    cond_y = regime(y)
    # number of times each sample's x for w.t dot x was inside the regime
    num_n = cond_y.sum(axis=1, keepdims=True)
    # => weighted sum over x
    wsum_x = nd.dot(num_n, x, transpose_a=True)

    # y's in regime
    reg_y = y * cond_y
    # sum of xy's in regime
    # sum_xy = (reg_y.expand_dims(axis=2) * x.expand_dims(axis=1)).sum(axis=0)
    sum_xy = nd.dot(reg_y, x, transpose_a=True)


    num_x_cur = num_n.sum()
    mean_x = (num_x * mean_x + wsum_x) / (num_x + num_x_cur + 1e-12)

    num_y_cur = cond_y.sum(axis=0)
    mean_xy = (num_y.T * mean_xy + sum_xy) / (num_y + num_y_cur + 1e-12).T

    num_y += num_y_cur

    return mean_x, mean_xy, num_y
Example #11
 def hue(src, delta, p=0.5):
     """Hue distortion"""
     if np.random.uniform(0, 1) > p:
         alpha = np.random.uniform(-delta, delta)
         u = np.cos(alpha * np.pi)
         w = np.sin(alpha * np.pi)
         bt = np.array([[1.0, 0.0, 0.0], [0.0, u, -w], [0.0, w, u]])
         tyiq = np.array([[0.299, 0.587, 0.114], [0.596, -0.274, -0.321],
                          [0.211, -0.523, 0.311]])
         ityiq = np.array([[1.0, 0.956, 0.621], [1.0, -0.272, -0.647],
                           [1.0, -1.107, 1.705]])
         t = np.dot(np.dot(ityiq, bt), tyiq).T
         src = nd.dot(src, nd.array(t, ctx=src.context))
         return src
     return src
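A sketch of applying the hue distortion to a random HWC image tensor, treating the method from Example #11 as a free function; the shape and values are illustrative:

import numpy as np
from mxnet import nd

img = nd.random.uniform(0, 255, shape=(32, 32, 3))   # H x W x C image
out = hue(img, delta=0.1, p=0.0)   # p=0.0 makes the distortion branch (almost surely) fire
print(out.shape)                    # (32, 32, 3)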
Example #12
 def forward(self, x):
     x = self.dense(x)
     # Use the constant parameters created, as well as the relu and dot
     # functions of NDArray
     x = nd.relu(nd.dot(x, self.rand_weight.data()) + 1)
     # Reuse the fully connected layer. This is equivalent to sharing
     # parameters with two fully connected layers
     x = self.dense(x)
     # Here in Control flow, we need to call asscalar to return the scalar
     # for comparison
     while x.norm().asscalar() > 1:
         x /= 2
     if x.norm().asscalar() < 0.8:
         x *= 10
     return x.sum()
Example #13
def evaluate_FITB_accuracy(data_loader: AsyncDataLoader, model):
    '''
    Measures the accuracy of the model in indicating the correct variable
    '''
    with data_loader as data_loader:
        correct = 0
        for split_batch, batch_length in tqdm(data_loader,
                                              total=data_loader.total_batches):
            batches_outputs = [(batch, model(batch.data))
                               for batch in split_batch]
            for batch, output in batches_outputs:
                predictions_labels = model.unbatchify(batch, output)
                for prediction, label in predictions_labels:
                    correct += int(nd.dot(prediction, label).asscalar())
    return correct / len(data_loader)
Example #14
def net(X, Verbose=False):
    X = X.as_in_context(W1.context)  # keep X on the same device (context) as W1
    # first convolutional layer
    h1_conv = nd.Convolution(data=X,
                             weight=W1,
                             bias=b1,
                             kernel=W1.shape[2:],
                             num_filter=W1.shape[0])
    h1_activation = nd.relu(h1_conv)
    h1 = nd.Pooling(data=h1_activation,
                    pool_type="max",
                    kernel=(2, 2),
                    stride=(2, 2))
    # second convolutional layer
    h2_conv = nd.Convolution(data=h1,
                             weight=W2,
                             bias=b2,
                             kernel=W2.shape[2:],
                             num_filter=W2.shape[0])
    h2_activation = nd.relu(h2_conv)
    h2 = nd.Pooling(data=h2_activation,
                    pool_type="max",
                    kernel=(2, 2),
                    stride=(2, 2))
    h2 = h2.flatten()
    # third layer: fully connected
    h3 = nd.relu(nd.dot(h2, W3) + b3)
    # fourth layer: fully connected
    h4 = nd.dot(h3, W4) + b4
    if Verbose:
        print('1st conv block:', h1.shape)
        print('2nd conv block:', h2.shape)
        print('1st dense:', h3.shape)
        print('2nd dense:', h4.shape)
        print('output:', h4)
    return h4
Example #15
    def forward(self, x, spatial_attention):
        '''
        Chebyshev graph convolution operation

        Parameters
        ----------
        x: mx.ndarray, graph signal matrix
           shape is (batch_size, N, F, T_{r-1}), F is the num of features

        spatial_attention: mx.ndarray, shape is (batch_size, N, N)
                           spatial attention scores

        Returns
        ----------
        mx.ndarray, shape is (batch_size, N, self.num_of_filters, T_{r-1})

        '''
        (batch_size, num_of_vertices, num_of_features,
         num_of_timesteps) = x.shape

        self.Theta.shape = (self.K, num_of_features, self.num_of_filters)
        self.Theta._finish_deferred_init()

        outputs = []
        for time_step in range(num_of_timesteps):
            # shape is (batch_size, V, F)
            graph_signal = x[:, :, :, time_step]
            output = nd.zeros(shape=(batch_size, num_of_vertices,
                                     self.num_of_filters),
                              ctx=x.context)
            for k in range(self.K):

                # shape of T_k is (V, V)
                T_k = self.cheb_polynomials[k]

                # shape of T_k_with_at is (batch_size, V, V)
                T_k_with_at = T_k * spatial_attention

                # shape of theta_k is (F, num_of_filters)
                theta_k = self.Theta.data()[k]

                # shape is (batch_size, V, F)
                rhs = nd.batch_dot(T_k_with_at.transpose((0, 2, 1)),
                                   graph_signal)

                output = output + nd.dot(rhs, theta_k)
            outputs.append(output.expand_dims(-1))
        return nd.relu(nd.concat(*outputs, dim=-1))
Example #16
    def lstm_rnn(self, inputs, h, c, temperature=1.0):
        outputs = []
        for X in inputs:
            # if not X.shape[0] == 77:
            #     continue
            X = nd.one_hot(X, 60)
            #print("X.shape",X.shape,self.Wxg.shape,self.Whg.shape,h.shape)
            g = nd.tanh(nd.dot(X, self.Wxg) + nd.dot(h, self.Whg) + self.bg)
            i = nd.sigmoid(nd.dot(X, self.Wxi) + nd.dot(h, self.Whi) + self.bi)
            f = nd.sigmoid(nd.dot(X, self.Wxf) + nd.dot(h, self.Whf) + self.bf)
            o = nd.sigmoid(nd.dot(X, self.Wxo) + nd.dot(h, self.Who) + self.bo)

            c = f * c + i * g
            h = o * nd.tanh(c)

            yhat_linear = nd.dot(h, self.Why) + self.by
            yhat = self.softmax(yhat_linear, temperature=temperature)
            #yhat = mx.ndarray.softmax(yhat_linear,temperature=temperature)
            outputs.append(yhat)

        return (outputs, h, c)
Example #17
 def __getTwoCross(self, X):
     batch = 0
     t = None
     for x in tqdm(X):
         s = 0
         for j1 in range(len(x)):
             for j2 in range(j1 + 1, len(x)):
                 s += (nd.dot(self.bw[j1], self.bw[j2]) * x[j1] *
                       x[j2]).asscalar()
         s = nd.array([[s]], dtype='float64')
         if batch == 0:
             t = nd.array(s)
         else:
             t = nd.concat(t, s, dim=0)
         batch += 1
     return t
Example #18
    def forward(self, x):
        root = next(iter(self._structure.items()))[0]

        if (len(self._routerlayer) > 0):
            router_d, embedd_d = self._contextify(x)(root)

            embedd = nd.stack(*[embedd_d[key] for key in sorted(embedd_d)],
                              axis=0)
            router = nd.stack(*[router_d[key] for key in sorted(router_d)],
                              axis=-1)

            return nd.dot(router, embedd)

        else:
            head = nd.ones_like(nd.slice_axis(x, axis=1, begin=0, end=None))
            return self._contextify(x)(root) * head
Example #19
def stats_batchwise(x_bat,
                    y_bat,
                    n,
                    x_mean,
                    y_mean,
                    x_var=None,
                    y_var=None,
                    xx_cov=None,
                    yy_cov=None,
                    xy_cov=None,
                    x_mean_skip=False,
                    y_mean_skip=False):
    m = x_bat.shape[0]

    x_bat_mean = x_bat.mean(axis=0, keepdims=True)
    y_bat_mean = y_bat.mean(axis=0, keepdims=True)

    dx = x_bat - x_bat_mean
    dy = y_bat - y_bat_mean

    if x_var is not None:
        x_bat_var = nd.sum(dx**2, axis=0)
        x_var += x_bat_var + ((x_mean - x_bat_mean)**2) * n * m / (n + m)

    if y_var is not None:
        y_bat_var = nd.sum(dy**2, axis=0)
        y_var += y_bat_var + ((y_mean - y_bat_mean)**2) * n * m / (n + m)

    if xx_cov is not None:
        xx_bat_cov = nd.dot(dx, dx, transpose_a=True)
        xx_cov += xx_bat_cov + nd.dot(
            (x_mean - x_bat_mean),
            (x_mean - x_bat_mean), transpose_a=True) * n * m / (n + m)

    if yy_cov is not None:
        yy_bat_cov = nd.dot(dy, dy, transpose_a=True)
        yy_cov += yy_bat_cov + nd.dot(
            (y_mean - y_bat_mean),
            (y_mean - y_bat_mean), transpose_a=True) * n * m / (n + m)

    if xy_cov is not None:
        xy_bat_cov = nd.dot(dy, dx, transpose_a=True)
        xy_cov += xy_bat_cov + nd.dot(
            (y_mean - y_bat_mean),
            (x_mean - x_bat_mean), transpose_a=True) * n * m / (n + m)

    if not x_mean_skip:
        x_mean = (n * x_mean + m * x_bat_mean) / (n + m)

    if not y_mean_skip:
        y_mean = (n * y_mean + m * y_bat_mean) / (n + m)

    n += m

    return n, x_mean, y_mean, x_var, y_var, xx_cov, yy_cov, xy_cov
Example #20
 def compute_vertex_layer(self, layer: int, vertex: int,
                          subgraph: Subgraph) -> NDArray:
     feature_sum = nd.zeros(shape=(self._feature_layers[layer -
                                                        1][vertex].size, 1),
                            ctx=data_ctx)
     for neighbor in self._graph.vertices[vertex].neighbors:
         prev = self._feature_layers[layer - 1][neighbor.id]
         prev_act = prev if layer == 1 else self._act(prev)
         feature_sum = feature_sum + prev_act / math.sqrt(
             subgraph.degree * neighbor.degree)
     res = self._b[layer - 1].data() + nd.dot(
         self._W[layer - 1].data(), feature_sum.as_in_context(model_ctx))
     res = res.as_in_context(data_ctx)
     return res \
         if not self._concatenate_features or layer == self._num_layers - 1 \
         else nd.concat(self._feature_layers[layer - 1][vertex], res, dim=0)
Example #21
def gru_rnn(inputs, H, *params):
    # inputs: num_steps matrices of shape batch_size * vocab_size
    # H: matrix of shape batch_size * hidden_dim
    # outputs: num_steps matrices of shape batch_size * vocab_size
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hy, b_y = params
    outputs = []
    for X in inputs:
        Z = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        R = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        H_tilda = nd.tanh(nd.dot(X, W_xh) + R * nd.dot(H, W_hh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return (outputs, H)
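Example #21's gru_rnn unpacks eleven parameter arrays from params. A hedged sketch of how such a parameter list might be initialized and the function called (the sizes and initialization scale are placeholders):

from mxnet import nd

vocab_size, hidden_dim = 28, 128

def _p(shape):
    return nd.random.normal(scale=0.01, shape=shape)

params = [
    _p((vocab_size, hidden_dim)), _p((hidden_dim, hidden_dim)), nd.zeros(hidden_dim),  # update gate
    _p((vocab_size, hidden_dim)), _p((hidden_dim, hidden_dim)), nd.zeros(hidden_dim),  # reset gate
    _p((vocab_size, hidden_dim)), _p((hidden_dim, hidden_dim)), nd.zeros(hidden_dim),  # candidate state
    _p((hidden_dim, vocab_size)), nd.zeros(vocab_size),                                # output layer
]

inputs = [nd.one_hot(nd.array([0, 1]), vocab_size) for _ in range(5)]
H = nd.zeros((2, hidden_dim))
outputs, H = gru_rnn(inputs, H, *params)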
Example #22
    def basis_message_func(self, edges):
        """Message function for basis regularizer"""
        ctx = edges.src['h'].context
        if self.num_bases < self.num_rels:
            # generate all weights from bases
            weight = self.weight.data(ctx).reshape(
                self.num_bases, self.in_feat * self.out_feat)
            weight = nd.dot(self.w_comp.data(ctx), weight).reshape(
                self.num_rels, self.in_feat, self.out_feat)
        else:
            weight = self.weight.data(ctx)

        msg = utils.bmm_maybe_select(edges.src['h'], weight, edges.data['type'])
        if 'norm' in edges.data:
            msg = msg * edges.data['norm']
        return {'msg': msg}
Example #23
def load_data_linear_regression(true_w, true_b, num_train=1000, num_test=0):
    """
    """
    assert isinstance(true_w, list)
    assert isinstance(true_b, float)
    num_features = len(true_w)

    true_w = nd.array(true_w)
    true_b = nd.array([
        true_b,
    ])

    x = nd.random.normal(scale=1, shape=(num_train + num_test, num_features))
    y = nd.dot(x, true_w) + true_b
    y += nd.random.normal(scale=0.01, shape=y.shape)
    return x, y
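A short usage example for the synthetic-data helper above, with arbitrary true parameters:

true_w = [2.0, -3.4]
true_b = 4.2
features, labels = load_data_linear_regression(true_w, true_b, num_train=1000)
print(features.shape, labels.shape)   # (1000, 2) (1000,)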
Example #24
 def forward(self, user_id, text, topics):
     user_word = self.emb_uw(user_id)
     word_emb = self.emb_word(text)
     topics_emb = self.emb_word(topics)
     topics_emb = nd.transpose(topics_emb, axes=(1,0))
     topics_emb = nd.reshape(topics_emb, (self.word_dim,self.topics_num,1))
     topics_emb = nd.dot(word_emb, topics_emb)
     topics_emb = nd.reshape(topics_emb, (self.batch_size,self.sentence_length,self.topics_num))
     topics_emb = nd.softmax(topics_emb,axis=2)
     topics_emb = self.mlp_topic(topics_emb)
     word_emb = self.mlp_word(word_emb)
     xw = nd.concat(user_word, word_emb, topics_emb, dim=1)
     xw_1 = self.mlp_w1(xw)
     xw_2 = self.mlp_w2(xw_1)
     res = self.mlp(xw_2)
     return res
Example #25
    def gru(self, inputs, state):
        W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = self.params
        H = state
        outputs = []

        for X in inputs:
            Z = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
            R = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
            H_tilda = nd.tanh(nd.dot(R * H, W_hh) + nd.dot(X, W_xh) + b_h)
            H = Z * H + (1 - Z) * H_tilda
            Y = nd.dot(H, W_hq) + b_q
            outputs.append(Y)

        return outputs, H
Example #26
def bilinear(x, W, y, input_size, seq_len, batch_size, num_outputs=1, bias_x=False, bias_y=False):
    """Do xWy

    Parameters
    ----------
    x : NDArray
        (input_size x seq_len) x batch_size
    W : NDArray
        (num_outputs x ny) x nx
    y : NDArray
        (input_size x seq_len) x batch_size
    input_size : int
        input dimension
    seq_len : int
        sequence length
    batch_size : int
        batch size
    num_outputs : int
        number of outputs
    bias_x : bool
        whether concat bias vector to input x
    bias_y : bool
        whether concat bias vector to input y

    Returns
    -------
    output : NDArray
        [seq_len_y x seq_len_x if output_size == 1 else seq_len_y x num_outputs x seq_len_x]
        x batch_size
    """
    if bias_x:
        x = nd.concat(x, nd.ones((1, seq_len, batch_size)), dim=0)
    if bias_y:
        y = nd.concat(y, nd.ones((1, seq_len, batch_size)), dim=0)

    ny = input_size + bias_y
    # W: (num_outputs x ny) x nx
    lin = nd.dot(W, x)
    if num_outputs > 1:
        lin = reshape_fortran(lin, (ny, num_outputs * seq_len, batch_size))
    y = y.transpose([2, 1, 0])  # May cause performance issues
    lin = lin.transpose([2, 1, 0])
    blin = nd.batch_dot(lin, y, transpose_b=True)
    blin = blin.transpose([2, 1, 0])
    if num_outputs > 1:
        blin = reshape_fortran(blin, (seq_len, num_outputs, seq_len, batch_size))
    return blin
Example #27
def lstm_rnn(inputs, state_h, state_c, *params):
    # inputs: num_steps matrices of shape batch_size * vocab_size
    # H: matrix of shape batch_size * hidden_dim
    # outputs: num_steps matrices of shape batch_size * vocab_size
    [
        W_xi, W_hi, b_i, W_xf, W_hf, b_f, W_xo, W_ho, b_o, W_xc, W_hc, b_c,
        W_hy, b_y
    ] = params

    H = state_h
    C = state_c
    outputs = []
    for X in inputs:
        I = nd.sigmoid(nd.dot(X, W_xi) + nd.dot(H, W_hi) + b_i)
        F = nd.sigmoid(nd.dot(X, W_xf) + nd.dot(H, W_hf) + b_f)
        O = nd.sigmoid(nd.dot(X, W_xo) + nd.dot(H, W_ho) + b_o)
        C_tilda = nd.tanh(nd.dot(X, W_xc) + nd.dot(H, W_hc) + b_c)
        C = F * C + I * C_tilda
        H = O * nd.tanh(C)
        Y = nd.dot(H, W_hy) + b_y
        outputs.append(Y)
    return (outputs, H, C)
Example #28
    def forward(self, inputs, state):
        """ forward function """
        h, c = state
        outputs = []
        for x in inputs:
            i = nd.sigmoid(
                nd.dot(x, self.w_xi) + nd.dot(h, self.w_hi) + self.b_i)
            f = nd.sigmoid(
                nd.dot(x, self.w_xf) + nd.dot(h, self.w_hf) + self.b_f)
            o = nd.sigmoid(
                nd.dot(x, self.w_xo) + nd.dot(h, self.w_ho) + self.b_o)

            c_tilda = nd.tanh(
                nd.dot(x, self.w_xc) + nd.dot(h, self.w_hc) + self.b_c)
            c = f * c + i * c_tilda
            h = o * c

            y = nd.dot(h, self.w_hq) + self.b_q
            outputs.append(y)

        y_hat = nd.concat(*outputs, dim=0)
        return y_hat, (h, c)
Example #29
    def name_face(self, person_face):
        ## Name the face of a person based on the dataset
        face = self.model.get_input(person_face)
        if face is None:
            return None
        face = nd.array(self.model.get_feature(face), ctx=self.ctx)

        # Calculate the similarity between the known features and the current face feature
        sim = nd.dot(self.dataset, face)
        scores = {}
        for known_id, index in self.names.items():
            scores[known_id] = max(sim[index]).asnumpy()

        if max(scores.values()) > self.args.threshold_face:
            return max(scores, key=scores.get)
        else:
            return None
Example #30
    def forward(self, adj, feat):
        r"""

        Description
        -----------
        Compute (Dense) Graph SAGE layer.

        Parameters
        ----------
        adj : mxnet.NDArray
            The adjacency matrix of the graph to apply SAGE Convolution on. When
            applied to a unidirectional bipartite graph, ``adj`` should be of
            shape :math:`(N_{out}, N_{in})`; when applied to a homogeneous
            graph, ``adj`` should be of shape :math:`(N, N)`. In both cases,
            a row represents a destination node while a column represents a source
            node.
        feat : mxnet.NDArray or a pair of mxnet.NDArray
            If a mxnet.NDArray is given, the input feature of shape :math:`(N, D_{in})` where
            :math:`D_{in}` is size of input feature, :math:`N` is the number of nodes.
            If a pair of mxnet.NDArray is given, the pair must contain two tensors of shape
            :math:`(N_{in}, D_{in})` and :math:`(N_{out}, D_{in})`.

        Returns
        -------
        mxnet.NDArray
            The output feature of shape :math:`(N, D_{out})` where :math:`D_{out}`
            is size of output feature.
        """
        check_eq_shape(feat)
        if isinstance(feat, tuple):
            feat_src = self.feat_drop(feat[0])
            feat_dst = self.feat_drop(feat[1])
        else:
            feat_src = feat_dst = self.feat_drop(feat)
        adj = adj.astype(feat_src.dtype).as_in_context(feat_src.context)
        in_degrees = adj.sum(axis=1, keepdims=True)
        h_neigh = (nd.dot(adj, feat_src) + feat_dst) / (in_degrees + 1)
        rst = self.fc(h_neigh)
        # activation
        if self.activation is not None:
            rst = self.activation(rst)
        # normalization
        if self._norm is not None:
            rst = self._norm(rst)

        return rst
Example #31
    def backward(self, DY):
        '''
        Backward-passes an input error gradient DY towards the input neurons of this layer.

        Parameters
        ----------

        DY :    mxnet.ndarray.ndarray.NDArray
                an error gradient shaped same as the output array of forward, i.e. (N,Hy,Wy,Dy) with
                N = number of samples in the batch
                Hy = height of the output
                Wy = width of the output
                Dy = output depth = input depth


        Returns
        -------

        DX :    mxnet.ndarray.ndarray.NDArray
                the error gradient propagated towards the input

        '''

        self.DY = DY
        N, Hy, Wy, NF = DY.shape
        hf, wf, df, NF = self.W.shape
        hstride, wstride = self.stride

        DX = nd.zeros_like(self.X, ctx=self.ctx, dtype=self.dtype)

        if not (hf == wf and self.stride == (1, 1)):
            for i in range(Hy):
                for j in range(Wy):
                    DX[:, i * hstride:i * hstride + hf,
                       j * wstride:j * wstride +
                       wf, :] += (nd.expand_dims(self.W, axis=0) *
                                  nd.expand_dims(
                                      DY[:, i:i + 1, j:j + 1, :], axis=3)).sum(
                                          axis=4)  #sum over all the filters
        else:
            for i in range(hf):
                for j in range(wf):
                    DX[:, i:i + Hy:hstride,
                       j:j + Wy:wstride, :] += nd.dot(DY, self.W[i, j, :, :].T)

        return DX  #* (hf*wf*df)**.5 / (NF*Hy*Wy)**.5
Example #32
 def forward(self, x, *args):
     # Fourier series
     x = nd.array(x)
     n = self.n
     ns = nd.array(range(1, n))
     ns = ns.reshape((-1, 1))
     T = nd.dot(ns, x.reshape((1, -1)))
     pl = 2 * pi / self.l.data().abs()
     #
     an = self.an.data()
     bn = self.bn.data()
     san = an[1:].reshape((-1, 1))
     sbn = bn[1:].reshape((-1, 1))
     f = san * nd.cos(T * pl) + sbn * nd.sin(T * pl)
     f = nd.sum(f, axis=0, keepdims=False)
     f = f + an[0]
     return f.reshape((-1, 1))
Example #33
def getfake(samples, dimensions, epsilon):
    wfake = nd.random_normal(shape=(dimensions))   # fake weight vector for separation
    bfake = nd.random_normal(shape=(1))            # fake bias
    wfake = wfake / nd.norm(wfake)                 # rescale to unit length

    # making some linearly separable data, simply by choosing the labels accordingly
    X = nd.zeros(shape=(samples, dimensions))
    Y = nd.zeros(shape=(samples))
    i = 0
    while (i < samples):
        tmp = nd.random_normal(shape=(1,dimensions))
        margin = nd.dot(tmp, wfake) + bfake
        if (nd.norm(tmp).asscalar() < 3) & (abs(margin.asscalar()) > epsilon):
            X[i,:] = tmp[0]
            Y[i] = 1 if margin.asscalar() > 0 else -1
            i += 1
    return X, Y
Example #34
 def forward(self, inputs, state):
     """ forward function """
     h, = state
     outputs = []
     for x in inputs:
         z = nd.sigmoid(
             nd.dot(x, self.w_xz) + nd.dot(h, self.w_hz) + self.b_z)
         r = nd.sigmoid(
             nd.dot(x, self.w_xr) + nd.dot(h, self.w_hr) + self.b_r)
         h_tilda = nd.tanh(
             nd.dot(x, self.w_xh) + nd.dot(h, self.w_hh) + self.b_h)
         h = z * h + (1 - z) * h_tilda
         y = nd.dot(h, self.w_hq) + self.b_q
         outputs.append(y)
     y_hat = nd.concat(*outputs, dim=0)
     return y_hat, (h, )
Example #35
    def function_set(self):
        self.__batch_y_hat = []
        for X in self.__batch_X:
            I = nd.sigmoid(
                nd.dot(X, self.__W_xi) + nd.dot(self.__state, self.__W_hi) +
                self.__b_i)
            F = nd.sigmoid(
                nd.dot(X, self.__W_xf) + nd.dot(self.__state, self.__W_hf) +
                self.__b_f)
            O = nd.sigmoid(
                nd.dot(X, self.__W_xo) + nd.dot(self.__state, self.__W_ho) +
                self.__b_o)
            C_tilda = nd.tanh(
                nd.dot(X, self.__W_xc) + nd.dot(self.__state, self.__W_hc) +
                self.__b_c)
            # note: C is carried across time steps here, just like the state
            self.__C = F * self.__C + I * C_tilda
            self.__state = O * nd.tanh(self.__C)
            self.__batch_y_hat.append(
                nd.dot(self.__state, self.__W_hy) + self.__b_y)
        self.__batch_y_hat = nd.concat(*self.__batch_y_hat, dim=0)

        return self.__batch_y_hat
Example #36
def getfake(samples, dimensions, epsilon):
    wfake = nd.random_normal(shape=(dimensions))  # fake weight vector for separation
    bfake = nd.random_normal(shape=(1))  # fake bias
    wfake = wfake / nd.norm(wfake)  # rescale to unit length

    # making some linearly separable data, simply by choosing the labels accordingly
    X = nd.zeros(shape=(samples, dimensions))
    Y = nd.zeros(shape=(samples))

    i = 0
    while (i < samples):
        tmp = nd.random_normal(shape=(1, dimensions))
        margin = nd.dot(tmp, wfake) + bfake
        if (nd.norm(tmp).asscalar() < 3) & (abs(margin.asscalar()) > epsilon):
            X[i, :] = tmp
            Y[i] = 2 * (margin > 0) - 1
            i += 1
    return X, Y
Example #37
def get_fake(samples, dimensions, epsilon):
    wfake = nd.random_normal(shape=(dimensions))
    bfake = nd.random_normal(shape=(1))
    wfake = wfake / nd.norm(wfake)

    X = nd.zeros(shape=(samples, dimensions))
    Y = nd.zeros(shape=(samples))

    i = 0
    while i < samples:
        tmp = nd.random_normal(shape=(1, dimensions))
        margin = nd.dot(tmp, wfake) + bfake
        if (nd.norm(tmp).asscalar() < 3) and (abs(margin.asscalar()) > epsilon):
            X[i, :] = tmp
            Y[i] = 1 if margin.asscalar() > 0 else -1
            i += 1
    return X, Y
Example #38
 def solve_discrete_lv(self, mat1, mat2=None, is_full_matrix=True):
     p = []  # store as a list: autograd won't let you append indices into the same matrix
     p.append(self.p0)
     for n in range(self.num_time_steps - 1):  # element-wise vector division and multiplication
         # Compute Ap to generate synthetic data for the full rank matrix A
         if is_full_matrix:
             mat_vec_prod = nd.dot(mat1, p[n])
         else:
             mat_vec_prod = compute_mat_vec_prod(mat1, mat2, p[n])
         p.append((1 + self.r * (1 - mat_vec_prod / self.k)) * p[n])
         # concat puts in nd array of size num_ts*N
         # need to take size (N, num_ts) and transpose otherwise default is doing row major
         # and we need column major storing of the linear list p
     return (nd.concat(*p, dim=0).reshape(self.num_time_steps,
                                          self.num_time_series).T)
Example #39
  def compute_gradients(self,
                        elbo: nd.NDArray,
                        data_batch: mx.io.DataBatch = None,
                        log_q_sum: nd.NDArray = None,
                        mode: str = 'train') -> None:
    """Compute gradients and assign them to variational parameters.

    Args:
      elbo: evidence lower bound that we maximize
      data_batch: minibatch of data with data indices as labels
      log_q_sum: sum of log probs of samples from variational distributions q.
    """
    cfg = self.gradient_config
    if cfg['estimator'] == 'pathwise':
      for block in self.sequential._children:
        for child_block in block._children:
          if hasattr(child_block, 'is_reparam'):
            assert child_block.is_reparam == True
    if len(self._point_mass_params) > 0 and mode == 'train':
      variables = [p.data() for p in self._point_mass_params]
      assert elbo.shape[-1] == cfg['batch_size']
      loss = nd.mean(-elbo, -1)
      point_mass_grads = autograd.grad(loss, variables, retain_graph=True)
      _assign_grads(self._point_mass_params, point_mass_grads)
    if cfg['estimator'] == 'pathwise':
        (-elbo).backward()
    elif cfg['estimator'] == 'score_function':
      variables = [param.repeated for param in self._score_params]
      score_functions = autograd.grad(log_q_sum, variables)
      mx.autograd.set_recording(False)
      score_grads = []
      for param, score_function in zip(self._score_params, score_functions):
        grad = _leave_one_out_gradient_estimator(score_function, -elbo)
        if 'emb' in param.name:
          # turns out the sparse implementation is not faster?!
          # data, label = data_batch
          # label = label.astype(np.int64)
          # grad = nd.sparse.row_sparse_array(
          #     grad, indices=label, shape=param.shape)
          # need to broadcast for embeddings
          one_hot = nd.one_hot(data_batch[1], depth=self.n_data)
          grad = nd.dot(one_hot, grad, transpose_a=True)
        score_grads.append(grad)
      _assign_grads(self._score_params, score_grads)
Example #40
def gru(inputs, state, params):
    """
    @description: gated recurrent unit (GRU)
    @param {type}
    @return:
    """
    W_xz, W_hz, b_z, W_xr, W_hr, b_r, W_xh, W_hh, b_h, W_hq, b_q = params
    H, = state
    outputs = []
    for X in inputs:
        Z = nd.sigmoid(nd.dot(X, W_xz) + nd.dot(H, W_hz) + b_z)
        R = nd.sigmoid(nd.dot(X, W_xr) + nd.dot(H, W_hr) + b_r)
        H_tilda = nd.tanh(nd.dot(X, W_xh) + nd.dot(R * H, W_hh) + b_h)
        H = Z * H + (1 - Z) * H_tilda
        Y = nd.dot(H, W_hq) + b_q
        outputs.append(Y)
    return outputs, (H, )
Example #41
 def hue(src, delta, p=0.5):
     """Hue distortion"""
     if np.random.uniform(0, 1) > p:
         alpha = random.uniform(-delta, delta)
         u = np.cos(alpha * np.pi)
         w = np.sin(alpha * np.pi)
         bt = np.array([[1.0, 0.0, 0.0],
                        [0.0, u, -w],
                        [0.0, w, u]])
         tyiq = np.array([[0.299, 0.587, 0.114],
                          [0.596, -0.274, -0.321],
                          [0.211, -0.523, 0.311]])
         ityiq = np.array([[1.0, 0.956, 0.621],
                           [1.0, -0.272, -0.647],
                           [1.0, -1.107, 1.705]])
         t = np.dot(np.dot(ityiq, bt), tyiq).T
         src = nd.dot(src, nd.array(t, ctx=src.context))
         return src
     return src
Example #42
    def most_similar_to(self, word, k=5):
        '''
        Returns top k words similar to the argument.
        E.g.
        emb = Embedder(dimensions = 50)
        print(emb.most_similar_to('baby'))
        Returns...
            ['babies', 'boy', 'girl', 'newborn', 'pregnant']
        '''
        vec = self.__emb_mapper[word].reshape((-1, 1))
        emb_vecs = self.__norm_vecs_by_row(self.__emb_mapper.idx_to_vec)
        dot_product = nd.dot(emb_vecs, vec)
        indices = nd.topk(dot_product.reshape((len(self.__embedder), )),
                          k=k + 1,
                          ret_typ='indices')
        indices = [int(i.asscalar()) for i in indices]
        # Remove unknown and input tokens.

        return self.__embedder.to_tokens(indices[1:])
Example #43
    def backward(self,DY):
        '''
        Backward-passes an input error gradient DY towards the input neurons of this layer.

        Parameters
        ----------

        DY :    mxnet.ndarray.ndarray.NDArray
                an error gradient shaped same as the output array of forward, i.e. (N,Hy,Wy,Dy) with
                N = number of samples in the batch
                Hy = height of the output
                Wy = width of the output
                Dy = output depth = input depth


        Returns
        -------

        DX :    mxnet.ndarray.ndarray.NDArray
                the error gradient propagated towards the input

        '''

        self.DY = DY
        N,Hy,Wy,NF = DY.shape
        hf,wf,df,NF = self.W.shape
        hstride, wstride = self.stride

        DX = nd.zeros_like(self.X,ctx=self.ctx, dtype=self.dtype)


        if not (hf == wf and self.stride == (1,1)):
            for i in range(Hy):
                for j in range(Wy):
                    DX[:,i*hstride:i*hstride+hf , j*wstride:j*wstride+wf , : ] += ( nd.expand_dims(self.W, axis=0) * nd.expand_dims(DY[:,i:i+1,j:j+1,:], axis=3) ).sum(axis=4)  #sum over all the filters
        else:
            for i in range(hf):
                for j in range(wf):
                    DX[:,i:i+Hy:hstride,j:j+Wy:wstride,:] += nd.dot(DY,self.W[i,j,:,:].T)

        return DX #* (hf*wf*df)**.5 / (NF*Hy*Wy)**.5
Example #44
def stats_batchwise(x_bat, y_bat, n, x_mean, y_mean, x_var=None, y_var=None, xx_cov=None, yy_cov=None, xy_cov=None, x_mean_skip=False, y_mean_skip=False):
    m = x_bat.shape[0]

    x_bat_mean = x_bat.mean(axis=0, keepdims=True)
    y_bat_mean = y_bat.mean(axis=0, keepdims=True)

    dx = x_bat - x_bat_mean
    dy = y_bat - y_bat_mean

    if x_var is not None:
        x_bat_var = nd.sum(dx**2, axis=0)
        x_var += x_bat_var + ((x_mean - x_bat_mean)**2) * n * m / (n+m)

    if y_var is not None:
        y_bat_var = nd.sum(dy**2, axis=0)
        y_var += y_bat_var + ((y_mean - y_bat_mean)**2) * n * m / (n+m)

    if xx_cov is not None:
        xx_bat_cov = nd.dot(dx, dx, transpose_a=True)
        xx_cov += xx_bat_cov + nd.dot((x_mean - x_bat_mean), (x_mean - x_bat_mean), transpose_a=True) * n * m / (n+m)

    if yy_cov is not None:
        yy_bat_cov = nd.dot(dy, dy, transpose_a=True)
        yy_cov += yy_bat_cov + nd.dot((y_mean - y_bat_mean), (y_mean - y_bat_mean), transpose_a=True) * n * m / (n+m)

    if xy_cov is not None:
        xy_bat_cov = nd.dot(dy, dx, transpose_a=True)
        xy_cov += xy_bat_cov + nd.dot((y_mean - y_bat_mean), (x_mean - x_bat_mean), transpose_a=True) * n * m / (n+m)

    if not x_mean_skip:
        x_mean = (n * x_mean + m * x_bat_mean) / (n+m)

    if not y_mean_skip:
        y_mean = (n * y_mean + m * y_bat_mean) / (n+m)

    n += m

    return n, x_mean, y_mean, x_var, y_var, xx_cov, yy_cov, xy_cov
Example #45
 def forward(self, x):
     linear = nd.dot(x, self.weight.data()) + self.bias.data()
     return nd.relu(linear)
Example #46
def linreg(X, w, b):
    """线性回归模型。"""
    return nd.dot(X, w) + b
Example #47
def get_distance_matrix(x):
    """Get distance matrix given a matrix. Used in testing."""
    square = nd.sum(x ** 2.0, axis=1, keepdims=True)
    distance_square = square + square.transpose() - (2.0 * nd.dot(x, x.transpose()))
    return nd.sqrt(distance_square)
Example #48
def linreg(X, w, b):
    """Linear regression."""
    return nd.dot(X, w) + b
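A tiny usage sketch for linreg; the shapes are arbitrary and the weights are just random placeholders:

from mxnet import nd

w = nd.random.normal(scale=0.01, shape=(2, 1))
b = nd.zeros(1)
X = nd.random.normal(shape=(10, 2))
y_hat = linreg(X, w, b)   # shape (10, 1): X·w + b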
Example #49
#patterntest
import numpy as np
from mxnet import nd
from ecGAN.layer import Conv2D
from ecGAN.explain.pattern.estimator import estimators

lay = Conv2D(20, 2, strides=2, padding=0, regimes=estimators['linear']())
lay.initialize()

data = nd.random.normal(5,shape=[1000,3,8,8])
out = lay(data)
lay.init_pattern()
lay.collect_pparams().initialize()

for mdat in [data[i::100] for i in range(100)]:
    lay.forward_logged(mdat)
    lay.learn_pattern()
lay.compute_pattern()
resdat = data.reshape([1000,3,4,2,4,2]).transpose([0,2,4,1,3,5]).reshape([1000*4*4,3*4])
resout = out.transpose([0,2,3,1]).reshape([1000*4*4,20])
rescov = nd.dot((resout - resout.mean(0)).T, (resdat - resdat.mean(0))) / resout.shape[0]

#TODO check whether correlation is correct!
var_y = (lay.weight.data().flatten() * rescov).mean(1, keepdims=True)
std_y = (resout - resout.mean(0)).mean(0)
Example #50
def test_dot():
    a = nd.ones(shape=(LARGE_X, SMALL_Y)) 
    b = nd.ones(shape=(SMALL_Y, SMALL_Y))
    res = nd.dot(a, b)
    assert np.sum(res[-1].asnumpy() == SMALL_Y) == b.shape[1]
Example #51
def gram(x):
    c = x.shape[1]
    n = x.size / x.shape[1]
    y = x.reshape((c, int(n)))
    return nd.dot(y, y.T) / n
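The gram helper assumes the channel axis is axis 1 and collapses the remaining axes, which is exact for a batch of one feature map. A sketch on a fake conv activation (sizes illustrative):

from mxnet import nd

feat = nd.random.normal(shape=(1, 64, 32, 32))   # (batch, channels, H, W)
g = gram(feat)
print(g.shape)   # (64, 64) channel-by-channel Gram matrix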
Example #52
    def forward(self, inputs, target, next_word_history, cache_history, begin_state=None): # pylint: disable=arguments-differ
        """Defines the forward computation for cache cell. Arguments can be either
        :py:class:`NDArray` or :py:class:`Symbol`.

        Parameters
        ----------
        inputs: NDArray
            The input data
        target: NDArray
            The label
        next_word_history: NDArray
            The next word in memory
        cache_history: NDArray
            The hidden state in cache history


        Returns
        --------
        out: NDArray
            The linear interpolation of the cache language model
            with the regular word-level language model
        next_word_history: NDArray
            The next words to be kept in the memory for look up
            (size is equal to the window size)
        cache_history: NDArray
            The hidden states to be kept in the memory for look up
            (size is equal to the window size)
        """
        output, hidden, encoder_hs, _ = \
            super(self.lm_model.__class__, self.lm_model).\
                forward(inputs, begin_state)
        encoder_h = encoder_hs[-1].reshape(-3, -2)
        output = output.reshape(-1, self._vocab_size)

        start_idx = len(next_word_history) \
            if next_word_history is not None else 0
        next_word_history = nd.concat(*[nd.one_hot(t[0], self._vocab_size, on_value=1, off_value=0)
                                        for t in target], dim=0) if next_word_history is None \
            else nd.concat(next_word_history,
                           nd.concat(*[nd.one_hot(t[0], self._vocab_size, on_value=1, off_value=0)
                                       for t in target], dim=0), dim=0)
        cache_history = encoder_h if cache_history is None \
            else nd.concat(cache_history, encoder_h, dim=0)

        out = None
        softmax_output = nd.softmax(output)
        for idx, vocab_L in enumerate(softmax_output):
            joint_p = vocab_L
            if start_idx + idx > self._window:
                valid_next_word = next_word_history[start_idx + idx - self._window:start_idx + idx]
                valid_cache_history = cache_history[start_idx + idx - self._window:start_idx + idx]
                logits = nd.dot(valid_cache_history, encoder_h[idx])
                cache_attn = nd.softmax(self._theta * logits).reshape(-1, 1)
                cache_dist = (cache_attn.broadcast_to(valid_next_word.shape)
                              * valid_next_word).sum(axis=0)
                joint_p = self._lambdas * cache_dist + (1 - self._lambdas) * vocab_L

            out = joint_p[target[idx]] if out is None \
                else nd.concat(out, joint_p[target[idx]], dim=0)
        next_word_history = next_word_history[-self._window:]
        cache_history = cache_history[-self._window:]
        return out, next_word_history, cache_history, hidden
Example #53
    def forward(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None):
        """Run decoding

        Parameters
        ----------
        word_inputs : mxnet.ndarray.NDArray
            word indices of seq_len x batch_size
        tag_inputs : mxnet.ndarray.NDArray
            tag indices of seq_len x batch_size
        arc_targets : mxnet.ndarray.NDArray
            gold arc indices of seq_len x batch_size
        rel_targets : mxnet.ndarray.NDArray
            gold rel indices of seq_len x batch_size
        Returns
        -------
        tuple
            (arc_accuracy, rel_accuracy, overall_accuracy, loss) when training, else if given gold target
        then return arc_accuracy, rel_accuracy, overall_accuracy, outputs, otherwise return outputs, where outputs is a
        list of (arcs, rels).
        """
        is_train = autograd.is_training()

        def flatten_numpy(ndarray):
            """Flatten nd-array to 1-d column vector

            Parameters
            ----------
            ndarray : numpy.ndarray
                input tensor

            Returns
            -------
            numpy.ndarray
                A column vector

            """
            return np.reshape(ndarray, (-1,), 'F')

        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]
        mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
        num_tokens = int(np.sum(mask))  # non padding, non root token number

        if is_train or arc_targets is not None:
            mask_1D = flatten_numpy(mask)
            mask_1D_tensor = nd.array(mask_1D)

        unked_words = np.where(word_inputs < self._vocab.words_in_train, word_inputs, self._vocab.UNK)
        word_embs = self.word_embs(nd.array(unked_words, dtype='int'))
        if self.pret_word_embs:
            word_embs = word_embs + self.pret_word_embs(nd.array(word_inputs))
        tag_embs = self.tag_embs(nd.array(tag_inputs))

        # Dropout
        emb_inputs = nd.concat(word_embs, tag_embs, dim=2)  # seq_len x batch_size

        top_recur = biLSTM(self.f_lstm, self.b_lstm, emb_inputs, batch_size,
                           dropout_x=self.dropout_lstm_input if is_train else 0)
        top_recur = nd.Dropout(data=top_recur, axes=[0], p=self.dropout_mlp)

        W_dep, b_dep = self.mlp_dep_W.data(), self.mlp_dep_b.data()
        W_head, b_head = self.mlp_head_W.data(), self.mlp_head_b.data()
        dep, head = leaky_relu(nd.dot(top_recur, W_dep.T) + b_dep), leaky_relu(nd.dot(top_recur, W_head.T) + b_head)
        dep, head = nd.Dropout(data=dep, axes=[0], p=self.dropout_mlp), nd.Dropout(data=head, axes=[0],
                                                                                       p=self.dropout_mlp)
        dep, head = nd.transpose(dep, axes=[2, 0, 1]), nd.transpose(head, axes=[2, 0, 1])
        dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
        head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

        W_arc = self.arc_W.data()
        arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size, seq_len, batch_size, num_outputs=1,
                              bias_x=True, bias_y=False)
        # (#head x #dep) x batch_size

        flat_arc_logits = reshape_fortran(arc_logits, (seq_len, seq_len * batch_size))
        # (#head ) x (#dep x batch_size)

        arc_preds = arc_logits.argmax(0)
        # seq_len x batch_size

        if is_train or arc_targets is not None:
            correct = np.equal(arc_preds.asnumpy(), arc_targets)
            arc_correct = correct.astype(np.float32) * mask
            arc_accuracy = np.sum(arc_correct) / num_tokens
            targets_1D = flatten_numpy(arc_targets)
            losses = self.softmax_loss(flat_arc_logits, nd.array(targets_1D))
            arc_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            arc_probs = np.transpose(
                np.reshape(nd.softmax(flat_arc_logits, axis=0).asnumpy(), (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

        W_rel = self.rel_W.data()
        rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size, seq_len, batch_size,
                              num_outputs=self._vocab.rel_size, bias_x=True, bias_y=True)
        # (#head x rel_size x #dep) x batch_size

        flat_rel_logits = reshape_fortran(rel_logits, (seq_len, self._vocab.rel_size, seq_len * batch_size))
        # (#head x rel_size) x (#dep x batch_size)

        _target_vec = nd.array(targets_1D if is_train else flatten_numpy(arc_preds.asnumpy())).reshape(
            seq_len * batch_size, 1)
        _target_mat = _target_vec * nd.ones((1, self._vocab.rel_size))

        partial_rel_logits = nd.pick(flat_rel_logits, _target_mat.T, axis=0)
        # (rel_size) x (#dep x batch_size)

        if is_train or arc_targets is not None:
            rel_preds = partial_rel_logits.argmax(0)
            targets_1D = flatten_numpy(rel_targets)
            rel_correct = np.equal(rel_preds.asnumpy(), targets_1D).astype(np.float32) * mask_1D
            rel_accuracy = np.sum(rel_correct) / num_tokens
            losses = self.softmax_loss(partial_rel_logits, nd.array(targets_1D))
            rel_loss = nd.sum(losses * mask_1D_tensor) / num_tokens

        if not is_train:
            rel_probs = np.transpose(np.reshape(nd.softmax(flat_rel_logits.transpose([1, 0, 2]), axis=0).asnumpy(),
                                                (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
        # batch_size x #dep x #head x #nclasses

        if is_train or arc_targets is not None:
            loss = arc_loss + rel_loss
            correct = rel_correct * flatten_numpy(arc_correct)
            overall_accuracy = np.sum(correct) / num_tokens

        if is_train:
            return arc_accuracy, rel_accuracy, overall_accuracy, loss

        outputs = []

        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs, rel_probs):
            # parse sentences one by one
            msk[0] = 1.
            sent_len = int(np.sum(msk))
            arc_pred = arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = rel_argmax(rel_prob, sent_len)
            outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

        if arc_targets is not None:
            return arc_accuracy, rel_accuracy, overall_accuracy, outputs
        return outputs