Example #1
def test_swapaxes_0d_1d_operands():
    x1 = C.input_variable(())
    with pytest.raises(ValueError):
        swapaxes_0d = C.swapaxes(x1)

    x2 = C.input_variable(2)
    with pytest.raises(ValueError):
        swapaxes_1d = C.swapaxes(x2)
Example #2
    def model(seq_image, decoded):
        params = dense(decoded)
        g_x, g_y, sigma2, delta, gamma = attention_parameters(params)

        i = C.Constant(np.arange(n) + 1)  # col of patch
        j = C.Constant(np.arange(n) + 1)  # row of patch
        mu_x = g_x + (i - n / 2 - 0.5) * delta
        mu_y = g_y + (j - n / 2 - 0.5) * delta
        mu_x = C.expand_dims(mu_x, axis=-1)
        mu_y = C.expand_dims(mu_y, axis=-1)
        # mu_x: [#, *] [n, 1]
        # mu_y: [#, *] [n, 1]

        image = C.sequence.unpack(seq_image,
                                  padding_value=0,
                                  no_mask_output=True)
        # image: [#] [*image_width, filters, image_height]

        width_pos = Cx.sequence.position(seq_image)
        # width_pos: [#, *] [1]

        width_pos_unpacked = C.sequence.unpack(width_pos,
                                               padding_value=999_999,
                                               no_mask_output=True)
        # width_pos_unpacked: [#] [*image_width, 1]

        a = C.sequence.broadcast_as(C.swapaxes(width_pos_unpacked), mu_x)
        # a: [#, *] [1, *image_width]
        # x pos index of image (width)

        b = C.Constant(np.arange(image_height).reshape((1, -1)))
        # b: [] [1, image_height]
        # y pos index of image (height)

        # calculate which portion of the image is attended to by the Gaussian filter
        f_xi = C.exp(-0.5 * C.square(a - mu_x) / sigma2)
        f_yj = C.exp(-0.5 * C.square(b - mu_y) / sigma2)
        # f_xi: [#, *] [n, *image_width]
        # f_yj: [#, *] [n, image_height]

        z_x = C.reduce_sum(f_xi, axis=1)
        z_y = C.reduce_sum(f_yj, axis=1)
        # z_x: [#, *] [n]
        # z_y: [#, *] [n]

        f_xi = f_xi / z_x
        f_yj = f_yj / z_y
        # f_xi: [#, *] [n, *image_width]
        # f_yj: [#, *] [n, image_height]

        # combine filters from x and y
        image_broadcasted = C.sequence.broadcast_as(image, f_yj)
        attended = gamma * C.times(
            f_xi, C.times_transpose(image_broadcasted, f_yj), output_rank=2)
        # attended: [#, *] [n, filters, n]
        attended = C.swapaxes(attended)
        # attended: [#, *] [filters, n (x) , n (y)]
        return attended
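As a shape sanity check (not part of the original example), the times / times_transpose pair above is equivalent to contracting the two filterbanks against the image over its width and height axes. A minimal NumPy sketch with made-up sizes:

import numpy as np

n, image_width, image_height, filters = 4, 10, 8, 3
f_xi = np.random.rand(n, image_width)            # x (width) filterbank
f_yj = np.random.rand(n, image_height)           # y (height) filterbank
image = np.random.rand(image_width, filters, image_height)
patch = np.einsum('nw,wfh,mh->nfm', f_xi, image, f_yj)
assert patch.shape == (n, filters, n)            # matches the [n, filters, n] comment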
Example #3
def cumsum(x, axis=0):
    dim = x.shape[axis]
    print('dim')
    print(dim)
    U = C.constant(np.triu(np.ones((dim, dim))).astype(x.dtype))
    print('U')
    print(U)
    if axis != -1:
        x = C.swapaxes(x, -1, axis)
        print('swapped')
        print(x)
    out = C.times(x, U)
    if axis != -1:
        out = C.swapaxes(out, -1, axis)
    return out
Example #4
    def attention(encoded, network):
        abk = dense(network)
        a, b, k = gaussian_windows_attention_coefficients(abk, nb_mixtures)
        # print("abk shape:", a.shape, b.shape, k.shape)
        # a, b, k: [#, n] [nb_mixture, 1]
        # context: [#, c] [char_ohe]

        encoded_unpacked = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
        # encoded_unpacked: [#] [*=c, char_ohe]
        u = Cx.sequence.position(encoded)  # position gives shape=(1, )
        # u: [#, c], [1]
        u_values, u_valid = C.sequence.unpack(u, padding_value=999_999).outputs
        # u_values: [#] [*=c, 1]
        # u_valid: [#] [*=c]
        u_values_broadcast = C.swapaxes(C.sequence.broadcast_as(u_values, k))
        # u_values_broadcast: [#, n] [1, *=c]
        u_valid_broadcast = C.sequence.broadcast_as(C.reshape(u_valid, (1,), 1), k)
        # u_valid_broadcast: [#, n] [*=c, 1] ~ shape verified correct at this point

        # print("u_values_broadcast shape:", u_values_broadcast.shape)
        # print("abk shape:", a.shape, b.shape, k.shape)
        phi = window_weight(a, b, k, u_values_broadcast)
        # phi: [#, n] [*=c, 1]
        zero = C.constant(0)
        phi = C.element_select(u_valid_broadcast, phi, zero, name="phi")
        # phi: [#, n] [*=c, 1]
        attended = C.reduce_sum(phi * C.sequence.broadcast_as(encoded_unpacked, phi), axis=0)
        # [#, n] [1, char_ohe]
        # print("attended_context shape:", attended_context.shape)
        output = C.squeeze(attended, name="GaussianWindowAttention")
        # [#, n] [char_ohe]
        return output
Example #5
    def scale_dot_product_attention_block(self, contextQ, contextV, contextK,
                                          name):

        Q = C.placeholder(shape=(2 * self.hidden_dim, ),
                          dynamic_axes=[self.b_axis, self.q_axis])
        V = C.placeholder(shape=(2 * self.hidden_dim, ),
                          dynamic_axes=[self.b_axis, self.q_axis])
        K = C.placeholder(shape=(2 * self.hidden_dim, ),
                          dynamic_axes=[self.b_axis, self.q_axis])

        Ql = C.layers.Dense(100)(Q)
        Vl = C.layers.Dense(100)(V)
        Kl = C.layers.Dense(100)(K)

        kvw, kvw_mask = C.sequence.unpack(Kl, padding_value=0).outputs
        vvw, _ = C.sequence.unpack(Vl, padding_value=0).outputs
        KT = C.swapaxes(kvw)

        S = C.reshape(C.times(Ql, KT) / math.sqrt(100), -1)
        kvw_mask_expanded = C.sequence.broadcast_as(kvw_mask, Ql)
        S = C.softmax(
            C.element_select(kvw_mask_expanded, S, C.constant(-1e+30)))
        att = C.times(S, vvw)

        return C.as_block(att, [(Q, contextQ), (V, contextV),
                                (K, contextK)], 'sdp_attention_block' + name,
                          'sdp_attention_block' + name)
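For reference, the unmasked core of this block is standard scaled dot-product attention, softmax(Q K^T / sqrt(d)) V. A minimal NumPy sketch (sizes are illustrative; the -1e+30 mask-fill step is omitted):

import numpy as np

def sdp_attention(Q, K, V):
    # S = Q K^T / sqrt(d), row-wise softmax, then a weighted sum of V
    S = Q @ K.T / np.sqrt(K.shape[-1])
    S = np.exp(S - S.max(axis=-1, keepdims=True))
    A = S / S.sum(axis=-1, keepdims=True)
    return A @ V

q_len, kv_len, d = 5, 7, 100
out = sdp_attention(np.random.rand(q_len, d),
                    np.random.rand(kv_len, d),
                    np.random.rand(kv_len, d))
assert out.shape == (q_len, d)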
Example #6
    def window_weight(a, b, k, u):
        """
        Calculate Phi is the window weight of character seq at position u of time t.
        Function tested to be correct on 2018-25-02 using numpy equivalent

        math:
            phi = summation of mixtures { a * exp ( -b * (k - u) ^ 2 ) }

        Args:
            a: importance of window within the mixture. Not normalised and doesn't sum to one.
            b: width of attention window
            k: location of window
            u: integer position of each item in sequence. Value from 1 to seq_length. (rank 2 tensor) [-3, 1]

        Returns:
            :class:`~cntk.ops.functions.Function`

        """
        # print(f"k shape: {k.shape}, u shape: {u.shape}")
        phi = a * C.exp(-1 * b * C.square(k - u))
        # print("internal phi shape:", phi.shape)
        phi = C.swapaxes(C.reduce_sum(phi,
                                      axis=0))  # Reduce sum the mixture axis
        # phi: [#, n] [*=c, 1]
        return phi
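The docstring mentions a NumPy equivalent; a minimal sketch of that check (my reconstruction, assuming a, b, k of shape (nb_mixtures, 1) and u of shape (1, seq_len)):

import numpy as np

nb_mixtures, seq_len = 3, 5
a, b, k = (np.random.rand(nb_mixtures, 1) for _ in range(3))
u = np.arange(1, seq_len + 1, dtype=float).reshape(1, -1)

phi = a * np.exp(-b * np.square(k - u))   # (nb_mixtures, seq_len)
phi = phi.sum(axis=0, keepdims=True).T    # reduce the mixture axis, then swap -> (seq_len, 1)
assert phi.shape == (seq_len, 1)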
Example #7
def cumsum(x, axis: int = -1):
    """ Calculates the cumulative sum across a static axis

    Arguments:
        x: input tensor
        axis (int): static axis of tensor to cumsum over

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    d = x.shape[axis]
    u = C.constant(np.triu(np.ones((d, d))).astype(x.dtype))
    if axis != -1:
        x = C.swapaxes(x, -1, axis)
    z = C.times(x, u)
    if axis != -1:
        z = C.swapaxes(z, -1, axis)
    return z
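The trick here is that right-multiplying by an upper-triangular ones matrix realizes the cumulative sum: (x @ U)[j] = sum of x[i] for i <= j. A quick NumPy check of the identity (illustrative, not from the source):

import numpy as np

x = np.array([1., 2., 3., 4.])
U = np.triu(np.ones((4, 4)))
assert np.allclose(x @ U, np.cumsum(x))  # [1, 3, 6, 10]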
Example #8
def swapaxes(t, axis1, axis2):
    if K.backend() == "cntk":
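        # CNTK static axes exclude the Keras batch axis, hence the -1 offset below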
        return C.swapaxes(t, axis1=(axis1 - 1),
                          axis2=(axis2 - 1))  # 0, 3, 2, 1, 4
    else:
        swap = np.arange(K.ndim(t))
        swap[axis1] = axis2
        swap[axis2] = axis1
        return K.permute_dimensions(t, swap)
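The else-branch builds a full permutation vector and swaps two of its entries; the trailing comment 0, 3, 2, 1, 4 corresponds to swapping axes 1 and 3 of a rank-5 tensor. A NumPy illustration with assumed sizes:

import numpy as np

t = np.zeros((2, 3, 4, 5, 6))
swap = np.arange(t.ndim)
swap[[1, 3]] = swap[[3, 1]]              # -> [0, 3, 2, 1, 4]
assert np.transpose(t, swap).shape == (2, 5, 4, 3, 6)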
Example #9
    def attention_layer(self, context, query, layer):

        q_processed = C.placeholder(shape=(2*self.hidden_dim,))
        p_processed = C.placeholder(shape=(2*self.hidden_dim,))

        qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

        wq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
        wg = C.parameter(shape=(8*self.hidden_dim, 8*self.hidden_dim), init=C.glorot_uniform())
        v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

        # seq[tensor[2d]] p_len x 2d
        wpt = C.reshape(C.times(p_processed, wp), (-1, 2*self.hidden_dim))

        # q_len x 2d
        wqt = C.reshape(C.times(qvw, wq), (-1, 2*self.hidden_dim))
        
        # seq[tensor[q_len]]
        S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))

        qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)

        # seq[tensor[q_len]]
        S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
        
        # seq[tensor[q_len]]
        A = C.softmax(S, axis=0)

        # seq[tensor[2d]]
        swap_qvw = C.swapaxes(qvw)
        cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))

        # seq[tensor[4d]]
        uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)
        
        # seq[tensor[4d]]
        gt = C.tanh(C.times(uc_concat, wg))
        
        # seq[tensor[4d]]
        uc_concat_star = gt * uc_concat
 
        # seq[tensor[4d]]
        vp = C.layers.Sequential([
            C.layers.Dropout(self.dropout),
            OptimizedRnnStack(self.hidden_dim, bidirectional=True, 
                use_cudnn=self.use_cudnn, name=layer+'_attention_rnn')])(uc_concat_star)
        
        return C.as_block(
            vp,
            [(p_processed, context), (q_processed, query)],
            'attention_layer',
            'attention_layer')
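The pooling step above (A times the swapped, broadcast query matrix, reduced over axis 1) is a weighted sum of the unpacked query vectors, i.e. cq = qvw^T A. A minimal NumPy sketch with illustrative sizes:

import numpy as np

q_len, d2 = 7, 6                         # d2 stands in for 2 * hidden_dim
qvw = np.random.rand(q_len, d2)          # unpacked query vectors
A = np.random.rand(q_len)
A = A / A.sum()                          # stand-in for the softmax output
cq = qvw.T @ A                           # attended query summary
assert cq.shape == (d2,)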
Example #10
    def group_lstm(dh, dc, x):
        x_grps = split(x, groups).outputs
        dh_grps = split(dh, groups).outputs
        dc_grps = split(dc, groups).outputs

        h_grps = []
        c_grps = []

        for lstm, h_grp, c_grp, x_grp in zip(lstms, dh_grps, dc_grps, x_grps):
            h, c = lstm(h_grp, c_grp, x_grp).outputs
            h_grps.append(h)
            c_grps.append(c)

        # inter-group correlation through permutation of dimensions
        h_output = C.reshape(
            C.swapaxes(C.splice(*h_grps, axis=C.Axis.new_leading_axis())),
            (shape, ))
        c_output = C.reshape(
            C.swapaxes(C.splice(*c_grps, axis=C.Axis.new_leading_axis())),
            (shape, ))

        return h_output, c_output
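The splice / swapaxes / reshape sequence implements a channel shuffle across groups (stack the group outputs, transpose, flatten), so neighbouring positions in the output come from different groups. In NumPy terms, with made-up sizes:

import numpy as np

groups, per_group = 4, 3
h = np.arange(groups * per_group)        # stand-in for the concatenated group outputs
shuffled = h.reshape(groups, per_group).T.reshape(-1)
assert shuffled[:4].tolist() == [0, 3, 6, 9]  # one element from each group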
Example #11
    def var(array, W=_W, B=None, square=0, sqrt=0, V=False, sizz=0):
        # W = tf.transpose(W, [0, 2, 3, 1])

        arrs = array.shape
        ashp = W.shape
        sb = (W.shape[1], 1, 1)
        WV = W.shape[-2:]
        xi = (-2, -1)
        x2 = (-2, -1, -3)

        if V:
            print(W.eval())
            print(arrs, ashp)
        mul = (array * W)

        if V:
            print('Wsamp', W[-1, -1].eval())
            print('array*w', (mul.eval())[0, -1])

        size = C.reduce_sum(W, axis=xi)  # shape=(outputs, channel)

        if V:
            print("sizesamp", size.shape, size.eval())
        if B is None:
            B = C.constant(0, shape=W.shape[0:2], dtype=np.float32)  # channel
        B = C.reshape(B, (*B.shape, *[1 for _ in range(len(ashp) - len(B.shape))]))
        if sizz == 1:
            mean = C.reduce_sum(mul, axis=xi) / size
        else:
            mean = C.reduce_sum(mul, axis=xi) / C.constant(value=WV[0] * WV[1], shape=sb, dtype=np.float32)
        if V:
            print("meansamp", mean.eval()[0, -1])
        if square:
            i = (C.square(mul - mean) + B)
        else:
            i = ((mul - mean) + B)
        di = i / size
        if V == 2:
            print("i", i.eval(), "i")
            print("di", di.eval(), "di")
        if V:
            print('isamp', i.shape, i.eval()[-1, -1])
        out = C.reduce_sum(i + B, axis=x2)
        # out = np.rollaxis(np.sum(i + B, axis=x2), -1, 1)
        print(out.shape)
        if sqrt:
            out = C.sqrt(out)
        out = C.swapaxes(C.reshape(out, out.shape[:4]), 3, 1)
        print(out.shape)
        assert out.shape == (arrs[0], ashp[0], arrs[1], arrs[2])
        return out
Example #12
    def model(query, key, value):
        q = phi(query_linear(query))
        k = phi(key_linear(key))
        v = value_linear(value)

        # key and value should have the same sequence length
        k_unpacked = C.sequence.unpack(k, padding_value=0, no_mask_output=True)
        # k_unpacked: [#] [*=kv, model_dim]
        v_unpacked = C.sequence.unpack(v, padding_value=0, no_mask_output=True)
        # v_unpacked: [#] [*=kv, hidden_dim]
        kv = C.times(C.swapaxes(k_unpacked), v_unpacked)
        # kv [#] [model_dim, hidden_dim]
        kv_broadcasted = C.sequence.broadcast_as(kv, q)  # this can be reused across queries
        # kv_broadcasted: [#, *] [model_dim, hidden_dim]

        numerator = C.squeeze(C.times(C.expand_dims(q, axis=C.Axis.new_leading_axis()), kv_broadcasted))
        # numerator [#, *] [hidden_dim, ]
        denom = C.reduce_sum(q * C.sequence.broadcast_as(C.sequence.reduce_sum(k), q))
        # denom [#, *] [1]

        return numerator / denom
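The saving in this linearized attention is that kv = K^T V is computed once per sequence and reused for every query, as the comment above notes. A minimal NumPy sketch of the per-query arithmetic (sizes are illustrative):

import numpy as np

kv_len, model_dim, hidden_dim = 5, 4, 3
q = np.random.rand(model_dim)            # one phi-mapped query
K = np.random.rand(kv_len, model_dim)    # phi-mapped keys
V = np.random.rand(kv_len, hidden_dim)   # values

kv = K.T @ V                             # [model_dim, hidden_dim], shared across queries
numerator = q @ kv                       # [hidden_dim]
denominator = q @ K.sum(axis=0)          # scalar normalizer
out = numerator / denominator
assert out.shape == (hidden_dim,)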
Example #13
def test_TransposeAxes(tmpdir):
    data = [[[0, 1], [2, 3], [4, 5]]]
    model = C.swapaxes(data, 1, 2)
    verify_no_input(model, tmpdir, 'TransposeAxes_0')
Example #14
def test_TransposeAxes(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data = np.array([[[0, 1], [2, 3], [4, 5]]]).astype(dtype)
        model = C.swapaxes(data, 1, 2)
        verify_no_input(model, tmpdir, 'TransposeAxes_0')