Example #1
def test_squeeze(operand_shape, axis, device_id, precision):
    operand = np.arange(np.prod(operand_shape)).reshape(operand_shape).astype('f')
    expected = np.squeeze(operand, axis)

    expected_forward = [expected]
    expected_backward = {
        'arg': [np.ones_like(operand)],
    }

    from .. import squeeze, placeholder
    p = placeholder()
    squeeze_with_axis = squeeze(p, axis)
    _test_unary_op(precision, device_id, squeeze_with_axis, operand,
                   expected_forward, expected_backward)
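
For reference, the behaviour the test checks against np.squeeze can be reproduced standalone; the shapes below are illustrative and not taken from the test suite:

import numpy as np
import cntk as C

data = np.arange(6, dtype=np.float32).reshape(1, 3, 2, 1)
x = C.constant(data)
print(C.squeeze(x).shape)          # (3, 2) - every singleton static axis is dropped
print(C.squeeze(x, axes=0).shape)  # (3, 2, 1) - only the named singleton axis is dropped
print(np.squeeze(data, 0).shape)   # (3, 2, 1) - matches numpy for the same axis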
Example #2
    def model(query, key, value):
        q = phi(query_linear(query))
        k = phi(key_linear(key))
        v = value_linear(value)

        # key and value should have the same sequence length
        k_unpacked = C.sequence.unpack(k, padding_value=0, no_mask_output=True)
        # k_unpacked: [#] [*kv=, model_dim]
        v_unpacked = C.sequence.unpack(v, padding_value=0, no_mask_output=True)
        # v_unpacked: [#] [*kv=, hidden_dim]
        kv = C.times(C.swapaxes(k_unpacked), v_unpacked)
        # kv [#] [model_dim, hidden_dim]
        kv_broadcasted = C.sequence.broadcast_as(kv, q)  # this can be reused across queries
        # kv_broadcasted: [#, *] [model_dim, hidden_dim]

        numerator = C.squeeze(C.times(C.expand_dims(q, axis=C.Axis.new_leading_axis()), kv_broadcasted))
        # numerator [#, *] [hidden_dim, ]
        denom = C.reduce_sum(q * C.sequence.broadcast_as(C.sequence.reduce_sum(k), q))
        # denom [#, *] [1]

        return numerator / denom
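
This is the inner function of a linear-attention factory; phi, query_linear, key_linear and value_linear come from the enclosing scope. A rough sketch of plausible bindings, with the dimensions and the positive feature map being assumptions rather than part of the original:

import cntk as C

model_dim, hidden_dim, in_dim = 64, 64, 32
query_linear = C.layers.Dense(model_dim)
key_linear = C.layers.Dense(model_dim)
value_linear = C.layers.Dense(hidden_dim)
phi = lambda z: C.elu(z) + 1  # positive feature map, as commonly used in linearised attention

kv_axis = C.Axis.new_unique_dynamic_axis('kv')
query = C.sequence.input_variable(in_dim)
key = C.sequence.input_variable(in_dim, sequence_axis=kv_axis)    # key and value share one axis
value = C.sequence.input_variable(in_dim, sequence_axis=kv_axis)
attended = model(query, key, value)  # one hidden_dim vector per query position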
Example #3
    def __init__(self, p, eps=1e-7):
        if isinstance(p, (C.Variable, C.Function)):
            self.p = C.squeeze(p)
        else:
            self.p = C.Constant(np.squeeze(p))

        self.eps = C.Constant(eps, name='eps')
        self.c = self.p.shape[0]

        self.prob = self.p / (self.eps + C.reduce_sum(self.p))
        self.logits = C.log(self.prob)
        self.accum_prob = self.prob @ C.Constant(
            (1 - np.tri(self.prob.shape[-1], k=-1)))

        p_log_p = self.logits * self.prob
        self._entropy = -C.reduce_sum(p_log_p)

        dist = C.input_variable(1, name='category index')
        # method 1
        self._log_prob = C.log(
            C.reduce_sum(self.prob * C.one_hot(dist, self.c)))
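
Assuming the constructor above belongs to a small categorical-distribution class (the name Categorical below is a guess), it can be exercised directly on a probability vector:

import numpy as np

cat = Categorical(np.array([0.1, 0.2, 0.7], dtype=np.float32))
print(cat.c)                  # 3 categories
print(cat._entropy.eval())    # entropy of the normalised distribution
print(cat.accum_prob.eval())  # cumulative probabilities later used for sampling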
Example #4
    def inner(a):
        # a: [#, *] [static_axes, num_classes]

        k_values, k_indices = C.top_k(a, k=k, axis=axis).outputs
        # k_indices [#, *] [static_axes, k]

        b = C.one_hot(k_indices, num_classes)
        # b: [#, *] [static_axes, k, num_classes]

        valid_probabilities = C.squeeze(C.reduce_sum(b, axis=-2), axes=(-2, ))
        # valid_probabilities: [#, *] [static_axes, num_classes]

        # k largest probabilities are retained, everything else is set to -inf and will not be sampled
        minus_inf = C.constant(-1e+30)
        d = a * valid_probabilities
        e = C.element_select(d, d, minus_inf)
        # e: [#, *] [static_axes, num_classes]

        # sample from top_k distribution once
        s = sample(e, axis=axis, name=name)
        # s: [#, *] [static_axes, num_classes]
        return s
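
The masking step relies on d being exactly zero outside the top-k positions; element_select then replaces those zeros with a very large negative number so softmax assigns them no mass. A minimal standalone illustration of that pattern (the values are made up):

import numpy as np
import cntk as C

a = C.constant(np.array([0.1, 2.0, 0.5, 3.0], dtype=np.float32))
_, k_idx = C.top_k(a, k=2).outputs
valid = C.squeeze(C.reduce_sum(C.one_hot(k_idx, 4), axis=-2), axes=(-2,))
masked = C.element_select(a * valid, a * valid, C.constant(-1e30))
print(C.softmax(masked).eval())  # probability mass only on the two largest entries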
Example #5
    def attention(encoded, network):
        abk = dense(network)
        a, b, k = gaussian_windows_attention_coefficients(abk, nb_mixtures)
        # print("abk shape:", a.shape, b.shape, k.shape)
        # a, b, k: [#, n] [nb_mixture, 1]
        # context: [#, c] [char_ohe]

        encoded_unpacked = C.sequence.unpack(encoded,
                                             padding_value=0,
                                             no_mask_output=True)
        # encoded_unpacked: [#] [*=c, char_ohe]
        u = Cx.sequence.position(encoded)  # position gives shape=(1, )
        # u: [#, c], [1]
        u_values, u_valid = C.sequence.unpack(u, padding_value=999_999).outputs
        # u_values: [#] [*=c, 1]
        # u_valid: [#] [*=c]
        u_values_broadcast = C.swapaxes(C.sequence.broadcast_as(u_values, k))
        # u_values_broadcast: [#, n] [1, *=c]
        u_valid_broadcast = C.sequence.broadcast_as(
            C.reshape(u_valid, (1, ), 1), k)
        # u_valid_broadcast: [#, n] [*=c, 1] ~ shape verified correct at this point

        # print("u_values_broadcast shape:", u_values_broadcast.shape)
        # print("abk shape:", a.shape, b.shape, k.shape)
        phi = window_weight(a, b, k, u_values_broadcast)
        # phi: [#, n] [*=c, 1]
        zero = C.constant(0)
        phi = C.element_select(u_valid_broadcast, phi, zero, name="phi")
        # phi: [#, n] [*=c, 1]
        attended = C.reduce_sum(phi *
                                C.sequence.broadcast_as(encoded_unpacked, phi),
                                axis=0)
        # [#, n] [1, char_ohe]
        # print("attended_context shape:", attended_context.shape)
        output = C.squeeze(attended, name="GaussianWindowAttention")
        # [#, n] [char_ohe]
        return output
Example #6
def crossentropy(y, t):
    prob = C.squeeze(C.reduce_sum(y * t, axis=0), 0)
    return -C.reduce_mean(C.unpack_batch(C.log(prob)))
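
The extra squeeze is needed because CNTK's reduction ops keep the reduced axis with length 1; a quick check of that behaviour:

import numpy as np
import cntk as C

z = C.constant(np.ones((3, 4), dtype=np.float32))
print(C.reduce_sum(z, axis=0).shape)                # (1, 4) - reduced axis kept as length 1
print(C.squeeze(C.reduce_sum(z, axis=0), 0).shape)  # (4,)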
Example #7
def forward(x):
    y = C.times(x, theta1) + C.squeeze(bias1, 0)
    y = C.element_max(y, 0.)
    return C.times(y, theta2) + C.squeeze(bias2, 0)
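
forward references parameters and inputs defined elsewhere in the same script (Example #8 uses the same names); a plausible setup, with the layer sizes and initialisation being assumptions:

import numpy as np
import cntk as C

input_dim, hidden_dim, num_classes = 784, 128, 10
x = C.input_variable(input_dim)
t = C.input_variable(num_classes)
theta1 = C.parameter((input_dim, hidden_dim), init=C.glorot_uniform())
bias1 = C.parameter((1, hidden_dim))   # squeezed to (hidden_dim,) inside forward
theta2 = C.parameter((hidden_dim, num_classes), init=C.glorot_uniform())
bias2 = C.parameter((1, num_classes))
softmax = C.softmax
# X, labels and dataset_size in Example #8 would be the training data loaded elsewhere.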
Example #8

def crossentropy(y, t):
    prob = C.squeeze(C.reduce_sum(y * t, axis=0), 0)
    return -C.reduce_mean(C.unpack_batch(C.log(prob)))


y = crossentropy(softmax(forward(x)), t)

batch_size = 20
for i in range(min(dataset_size, 100000) // batch_size):
    lr = 0.5 * (.1**(max(i - 100, 0) // 1000))
    sample = X[batch_size * i:batch_size * (i + 1)]
    target = labels[batch_size * i:batch_size * (i + 1)]
    g = y.grad({x: sample, t: target}, wrt=[theta1, bias1, theta2, bias2])
    for param, grad in g.items():
        param.value = param.value - grad * lr
    loss = y.eval({x: sample, t: target})
    print("cost {} - learning rate {}".format(loss, lr))

y = C.squeeze(C.argmax(forward(x), 0), 0)
accuracy = 0
for i in range(1000):
    sample = X[batch_size * i:batch_size * (i + 1)]
    target = labels[batch_size * i:batch_size * (i + 1)]
    tt = y.eval({x: sample})
    accuracy += np.sum(tt == np.argmax(target, axis=1))

print("Accuracy", accuracy / 1000. / batch_size)
# accuracy 99.36
Example #9
    def _(t, batch_dim=0):
        return cntk.squeeze(cntk.flatten(t, axis=batch_dim))
Example #10
    def inner(a):
        b = C.reshape(a, (n, -1))
        # each slice b[i] keeps a length-1 leading axis, which squeeze drops
        return tuple(C.squeeze(b[i]) for i in range(n))
Example #11
    def sample(self, n=1):
        samples = C.random.uniform((n, 1))
        indices = C.argmax(C.greater(self.accum_prob - samples, 0), axis=1)
        return C.squeeze(indices)
Example #12
def gpt2_self_attention(token_dims: int,
                        head_dims: int,
                        mask_opt: bool = False,
                        as_block: bool = False,
                        name: str = 'self_attention'):
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(),
                                    C.Axis.default_dynamic_axis()),
                      name=name)

    # q = C.layers.Dense(token_dims, name=name+'_q')(X)
    # k = C.layers.Dense(token_dims, name=name+'_k')(X)
    # v = C.layers.Dense(token_dims, name=name+'_v')(X)

    # attn_c_attn_w = C.parameter((token_dims,3*token_dims), name='attn_c_attn_w')
    # qkv = C.reshape(X@attn_c_attn_w, (3,-1), name='qkv')

    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    q_mh = C.reshape(q_seq, (head_dims, -1), name='multi_head_q')
    k_mh = C.reshape(k_seq, (head_dims, -1), name='multi_head_k')
    v_mh = C.reshape(v_seq, (head_dims, -1), name='multi_head_v')

    #region split multi head attention
    q_heads = [
        C.squeeze(q_mh[i], name='single_head_q' + str(i))
        for i in range(head_dims)
    ]
    k_heads = [
        C.squeeze(k_mh[i], name='single_head_k' + str(i))
        for i in range(head_dims)
    ]
    v_heads = [
        C.squeeze(v_mh[i], name='single_head_v' + str(i))
        for i in range(head_dims)
    ]
    #endregion

    attention_head = []
    for i in range(head_dims):
        q = q_heads[i]
        k = k_heads[i]
        v = v_heads[i]

        #region score
        # q_ = C.sequence.last(q, name='last_q'+str(i)) # q present
        q_ = C.sequence.unpack(q, 0, True, name='seq_q' + str(i))  # q seq
        k_ = C.sequence.unpack(k, 0, True, name='seq_k' + str(i))  # k seq
        v_ = C.sequence.unpack(v, 0, True, name='seq_v' + str(i))  # v seq

        scores = C.times_transpose(q_, k_)
        scaled = scores * (1 / C.sqrt(v_.shape[-1]))

        #region mask opt
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')

        scaled = C.element_min(scaled, inf_mask)
        #endregion

        softmax = C.softmax(scaled)
        #endregion
        #region sum
        attention = C.times(softmax, v_)
        attention_seq = C.to_sequence_like(attention, X)
        #endregion
        attention_head.append(attention_seq)


    #region merge attention heads
    attention = C.splice(*attention_head, name='merged_attention')
    #endregion

    #region project
    project = C.layers.Dense(token_dims, name='project')(attention)
    #endregion

    if as_block:
        return C.as_block(project, [(X, X)], 'gpt2_self_attention',
                          'gpt2_self_attention')

    return project
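
A rough way to apply the block, assuming the triangular_matrix_seq helper used inside it is available from the same codebase; the dimensions are illustrative:

import cntk as C

token_dims, head_dims = 768, 12
tokens = C.sequence.input_variable(token_dims, name='tokens')
attn = gpt2_self_attention(token_dims, head_dims, as_block=True)(tokens)
print(attn.shape)  # (768,) - one projected vector per token position

Note that head_dims here acts as the number of heads: after the (head_dims, -1) reshape each head carries token_dims // head_dims channels.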