def attention(query, key, value):
    """Scaled dot-product attention over CNTK dynamic sequences.

    Uses ``obey_sequence_order`` and ``max_seq_len`` from the enclosing
    scope to optionally forbid attending to future positions.
    """
    # Recover the query's static dimension as a graph node; sequence.last
    # cannot be used here because it would conflict with recurrence.
    # NOTE(review): scores are divided by d_k itself, not sqrt(d_k) as in
    # standard scaled dot-product attention — confirm this is intended.
    dk = C.reduce_sum(C.ones_like(query))  # [#, *] [1, ]

    keys_flat = C.sequence.unpack(key, padding_value=0, no_mask_output=True)      # [#] [-3, key_dim]
    values_flat = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]
    keys_bcast = C.sequence.broadcast_as(keys_flat, query)                        # [#, *] [-3, key_dim]

    # [#, *] [q_dim] @ [#, *] [key_dim, -3] (q_dim must equal key_dim):
    # one score per key position for every query element.
    scaled = C.times_transpose(query, keys_bcast) / dk  # [#, *] [-3, ]

    # Mask out invalid temporal connections when sequence order must be obeyed.
    if obey_sequence_order:
        if not max_seq_len:
            raise ValueError("max_seq_len must be defined when obey_sequence_order is True")
        # scores_flat: [#] [-3, -3] — its upper-right triangle gets zeroed out.
        scores_flat, scores_mask = C.sequence.unpack(scaled, padding_value=0).outputs
        neg_inf = C.constant(-1e+30)
        lower_tri = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
        lower_tri = C.reconcile_dynamic_axes(lower_tri, scores_flat)               # [#] [max_seq, max_seq]
        lower_tri = C.crop_manual(lower_tri, scores_flat, 0, 0)                    # [#] [-3, -3]
        scores_flat = C.element_select(lower_tri, scores_flat, neg_inf)            # [#] [-3, -3]
        scaled = C.to_sequence_like(scores_flat, query)                            # [#, *] [-3]

    attended = C.times(C.softmax(scaled, axis=-1),
                       C.sequence.broadcast_as(values_flat, query))  # [#, *] [value_dim,]
    return attended
def test_lstm_over_lstm_thought_vectors_2(device_id):
    """Hierarchical model test: an utterance-level LSTM produces thought
    vectors that are regrouped per conversation (via UtteranceBatchReshape,
    with sentinel utterances as padding) and fed to a conversation-level
    LSTM classifier."""
    dev = cntk_device(device_id)
    vocab_size = 3
    embedding_dim = 2
    lstm_dim = 2
    label_dim = 2

    utterances_input = C.sequence.input_variable((vocab_size), is_sparse=True, name='utterances')
    conversation_lengths_input = C.input_variable((), name='conversation_sequence_lengths')
    label_input = C.sequence.input_variable(label_dim, is_sparse=True,
                                            sequence_axis=C.Axis('label_sequence'),
                                            name='labels')

    with C.default_options(initial_state=0.1):
        net = C.layers.Embedding(embedding_dim, name='embed')(utterances_input)
        net = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(net)
        net = C.sequence.last(net)
        net = C.user_function(UtteranceBatchReshape(net, conversation_lengths_input))
        net = C.to_sequence_like(net, label_input)
        net = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(net)
        net = C.layers.Dense(label_dim, name='classify')(net)
    ce = C.cross_entropy_with_softmax(net, label_input)

    def utt(rows):
        # One sparse utterance (sequence of one-hot word vectors) on the CPU.
        return C.NDArrayView.from_csr(_to_csr(rows), device=C.cpu())

    sentinel = utt([[0, 0, 1]])
    utterances = [
        utt([[0, 1, 1], [0, 1, 0], [1, 0, 0]]),  # conversation 1, utterance 1
        utt([[0, 1, 0], [0, 1, 1]]),             # conversation 1, utterance 2
        utt([[0, 1, 1], [0, 1, 0]]),             # conversation 1, utterance 3
        utt([[0, 1, 1]]),                        # conversation 2, utterance 1
        sentinel,                                # padding
        sentinel,                                # padding
        utt([[0, 1, 0], [0, 1, 1], [1, 0, 0]]),  # conversation 3, utterance 1
        utt([[0, 1, 0]]),                        # conversation 3, utterance 2
        sentinel,                                # padding
    ]
    all_utt_data = C.Value.create(C.sequence.input_variable((vocab_size), is_sparse=True),
                                  utterances, device=C.cpu()).data
    conversation_lengths_data = np.asarray([3, 1, 2], dtype=np.float32)
    label_data = [_to_csr([[0, 1], [0, 1], [1, 0]]),
                  _to_csr([[1, 0]]),
                  _to_csr([[1, 0], [0, 1]])]

    param_grads, loss_result = ce.grad({utterances_input: all_utt_data,
                                        label_input: label_data,
                                        conversation_lengths_input: conversation_lengths_data},
                                       wrt=ce.parameters, outputs=[ce], as_numpy=False)
    loss_result = loss_result.as_sequences()

    absolute_tolerance = 0.01
    assert np.allclose(loss_result[0], [[0.678914], [0.668076], [0.728129]], atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.679029]], atol=absolute_tolerance)
    assert np.allclose(loss_result[2], [[0.705393], [0.674243]], atol=absolute_tolerance)
def test_lstm_over_lstm_thought_vectors(device_id):
    """Outer LSTM over thought vectors produced by an inner LSTM; the input
    carries a free dimension for the number of inner sequences per step."""
    dev = cntk_device(device_id)
    vocab_size = 3
    embedding_dim = 2
    lstm_dim = 2
    label_dim = 2

    x_seq_input = C.sequence.input_variable((C.FreeDimension, vocab_size), is_sparse=True, name='features')
    label_seq_input = C.sequence.input_variable(label_dim, is_sparse=True,
                                                sequence_axis=C.Axis('label_sequence'),
                                                name='labels')

    with C.default_options(initial_state=0.1):
        net = C.layers.Embedding(embedding_dim, name='embed')(x_seq_input)
        net = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(net)
        net = C.sequence.last(net)
        net = C.to_sequence_like(net, label_seq_input)
        net = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(net)
        net = C.layers.Dense(label_dim, name='classify')(net)
    ce = C.cross_entropy_with_softmax(net, label_seq_input)

    def sparse_view(rows, count):
        # `count` utterances of 3 one-hot word vectors each, as a sparse view.
        return C.NDArrayView.from_csr(_to_csr(rows), shape=(count, 3, 3), device=C.cpu())

    view1 = sparse_view([[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
                         [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
                         [[1, 0, 0], [0, 0, 1], [1, 1, 0]]], 3)
    view2 = sparse_view([[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
                         [[0, 1, 0], [1, 0, 1], [0, 0, 0]]], 2)
    x_seq_data = C.Value.create(C.sequence.input_variable((3, 3), is_sparse=True),
                                [view1, view2], device=C.cpu()).data
    label_seq_data = [_to_csr([[0, 1], [0, 1], [1, 0]]),
                      _to_csr([[1, 0], [0, 1]])]

    param_grads, loss_result = ce.grad({x_seq_input: x_seq_data, label_seq_input: label_seq_data},
                                       wrt=ce.parameters, outputs=[ce], as_numpy=False)
    loss_result = loss_result.as_sequences()

    absolute_tolerance = 0.02
    assert np.allclose(loss_result[0], [[0.67126], [0.676331], [0.765814]], atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.685199], [0.681736]], atol=absolute_tolerance)
def test_lstm_over_lstm_thought_vectors(device_id):
    """Variant of the thought-vector test that pins the CNTK random seed so
    parameter initialization is reproducible, restoring the seed afterwards."""
    previous_random_seed = C.cntk_py.get_random_seed()
    C.cntk_py.reset_random_seed(0)
    dev = cntk_device(device_id)
    vocab_size = 3
    embedding_dim = 2
    lstm_dim = 2
    label_dim = 2

    x_seq_input = C.sequence.input((C.FreeDimension, vocab_size), is_sparse=True, name='features')
    label_seq_input = C.sequence.input(label_dim, is_sparse=True,
                                       sequence_axis=Axis('label_sequence'), name='labels')

    with C.default_options(initial_state=0.1):
        net = C.layers.Embedding(embedding_dim, name='embed')(x_seq_input)
        net = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(net)
        net = C.sequence.last(net)
        net = C.to_sequence_like(net, label_seq_input)
        net = C.layers.Recurrence(C.layers.LSTM(lstm_dim), go_backwards=False)(net)
        net = C.layers.Dense(label_dim, name='classify')(net)
    ce = C.cross_entropy_with_softmax(net, label_seq_input)

    view1 = C.NDArrayView.from_csr(_to_csr([[[0, 1, 1], [0, 1, 0], [1, 0, 0]],
                                            [[1, 1, 0], [0, 0, 1], [1, 0, 1]],
                                            [[1, 0, 0], [0, 0, 1], [1, 1, 0]]]),
                                   shape=(3, 3, 3), device=C.cpu())
    view2 = C.NDArrayView.from_csr(_to_csr([[[0, 0, 1], [0, 1, 1], [1, 0, 1]],
                                            [[0, 1, 0], [1, 0, 1], [0, 0, 0]]]),
                                   shape=(2, 3, 3), device=C.cpu())
    x_seq_data = C.Value.create(C.sequence.input((3, 3), is_sparse=True),
                                [view1, view2], device=C.cpu()).data
    label_seq_data = [_to_csr([[0, 1], [0, 1], [1, 0]]),
                      _to_csr([[1, 0], [0, 1]])]

    param_grads, loss_result = ce.grad({x_seq_input: x_seq_data, label_seq_input: label_seq_data},
                                       wrt=ce.parameters, outputs=[ce], as_numpy=False)
    loss_result = loss_result.as_sequences()

    # TODO: The tolerance here is inordinately high due to the non-determinism
    # in initialization of parameters, as the individual tests are not run in
    # separate processes; adding or removing tests affects the random
    # initialization of parameters in all other tests that do not explicitly
    # set the seed. Lower the tolerance to 0.01 once the test infrastructure
    # issue is fixed.
    absolute_tolerance = 0.02
    assert np.allclose(loss_result[0], [[0.63504], [0.673343], [0.698446]], atol=absolute_tolerance)
    assert np.allclose(loss_result[1], [[0.772344], [0.64295]], atol=absolute_tolerance)
    C.cntk_py.reset_random_seed(previous_random_seed)
def batchmatmul(left, right, output_rank=1, infer_input_rank_to_map=C.TIMES_NO_INFERRED_INPUT_RANK, name=''):
    """Batch matrix multiplication, analogous to ``tensorflow.matmul``.

    The first static axis of both operands is treated as the batch axis
    (exactly one static batch axis is supported) and a matrix product is
    computed per batch element.

    Example:
        a = C.sequence.input_variable((3, 4, 5))     # batch matrix
        b = C.sequence.input_variable((3, 5, 6))     # batch matrix
        c = Cx.batchmatmul(a, b)
        assert c.shape == (3, 4, 6)                  # 3 is the batch axis

        a = C.input_variable((3, 4, 5))              # batch matrix
        b = C.input_variable((3, 5, 6, 7))           # batch tensor
        c = Cx.batchmatmul(a, b, output_rank=2)
        assert c.shape == (3, 4, 6, 7)

    Arguments:
        left: left operand (matrix or higher-rank tensor with batch axis)
        right: right operand (matrix or higher-rank tensor with batch axis)
        output_rank (int): number of trailing axes treated as the output of
            the per-element product when operands are tensors
        infer_input_rank_to_map (int): internal use only; keep the default
        name (str, optional): name of the resulting Function

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    left_shape = left.shape
    right_shape = right.shape
    has_seq_axis = len(left.dynamic_axes) == 2
    batch_dim = left_shape[0]  # first static axis is assumed to be the batch axis

    if left_shape[0] != right_shape[0]:
        raise ValueError(
            "first axis of left operand and right operand must be the same")
    if has_seq_axis and (left_shape[0] < 0 or right_shape[0] < 0):
        raise ValueError(
            "Static batch axis cannot be a free axis when dynamic sequence axis is also present"
        )

    if has_seq_axis:
        # Merge the dynamic sequence axis into the static batch axis.
        left_flat = C.reshape(
            C.sequence.unpack(left, padding_value=0, no_mask_output=True),
            (-1, ) + left_shape[1:])
        right_flat = C.reshape(
            C.sequence.unpack(right, padding_value=0, no_mask_output=True),
            (-1, ) + right_shape[1:])
    else:
        left_flat = left
        right_flat = right

    # Fold the static batch axis into a fresh dynamic sequence axis so a plain
    # `times` performs one product per batch element; no sequence length is
    # given since the batch axis has been folded in, and to_sequence_like
    # tells CNTK both operands share the same sequence axis.
    left_seq = C.to_sequence(left_flat)
    right_seq = C.to_sequence_like(right_flat, left_seq)

    product = C.times(left_seq, right_seq,
                      output_rank=output_rank,
                      infer_input_rank_to_map=infer_input_rank_to_map)

    # Split the folded axis back into the original dynamic sequence axis and
    # static batch axis.
    product_flat = C.sequence.unpack(product, padding_value=0, no_mask_output=True)
    if has_seq_axis:
        unfolded = C.reshape(product_flat, (-1, batch_dim) + product.shape)
        packed = C.to_sequence_like(unfolded, left)
    else:
        packed = C.reshape(product_flat, (batch_dim, ) + product.shape)

    return _inject_name(packed, name)
def attention_layer(self, context, query, dim):
    """Two-level co-attention between a context sequence and a query sequence.

    Builds the attention graph once over generic placeholders and then clones
    it twice (query-over-context and context-over-context) with shared
    parameters, returning the spliced attended context as a CNTK block.

    Axis notation in the comments: [#,c] = (batch, context seq),
    [#,q] = (batch, query seq); the bracketed suffix is the static shape.

    Arguments:
        context: context sequence, one `dim`-vector per step
        query: query sequence, one `dim`-vector per step
        dim (int): static dimensionality of both input sequences

    Returns:
        A CNTK block mapping (context, query) to the attended context
        (splice of input, weighted query, and context-to-context attention).
    """
    input_ph = C.placeholder(shape=(dim, ))
    input_mem = C.placeholder(shape=(dim, ))
    # Shared projections of context and query into the hidden space.
    with C.layers.default_options(bias=False, activation=C.relu):
        attn_proj_enc = C.layers.Dense(self.hidden_dim, init=glorot_uniform(), input_rank=1, name="Wqu")
        attn_proj_dec = C.layers.Dense(self.hidden_dim, init=glorot_uniform(), input_rank=1)
    inputs_ = attn_proj_enc(input_ph)  # [#,c][d]
    memory_ = attn_proj_dec(input_mem)  # [#,q][d]
    # Generic placeholders for the cloneable attention subgraph.
    cln_mem_ph = C.placeholder()  # [#,q][?=d]
    cln_inp_ph = C.placeholder()  # [#,c][?=d]
    # Unpack the "inputs" sequence into a dense matrix plus validity mask.
    unpack_inputs, inputs_mask = C.sequence.unpack(
        cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
    expand_inputs = C.sequence.broadcast_as(unpack_inputs, cln_mem_ph)  # [#,q][*=c,d]
    # Scaled dot-product scores of every memory step against all input steps.
    matrix = C.reshape(
        C.times_transpose(cln_mem_ph, expand_inputs) /
        (self.hidden_dim**0.5), (-1, ))  # [#,q][*=c]
    # Mask out padded input positions with a large negative value.
    matrix = C.element_select(
        C.sequence.broadcast_as(inputs_mask, cln_mem_ph), matrix,
        C.constant(-1e30))
    logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]
    trans_expand_inputs = C.transpose(expand_inputs, [1, 0])  # [#,q][d,*=c]
    # First-level attention: input steps weighted per memory step.
    q_over_c = C.reshape(
        C.reduce_sum(logits * trans_expand_inputs, axis=1),
        (-1, )) / (self.hidden_dim**0.5)  # [#,q][d]
    new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]
    # --- second level: attend back over the memory axis ---
    unpack_matrix, matrix_mask = C.sequence.unpack(
        matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
    # Rebuild a sequence along the input axis so we can softmax over memory.
    inputs_mask_s = C.to_sequence(C.reshape(inputs_mask, (-1, 1)))  # [#,c'][1]
    trans_matrix = C.to_sequence_like(C.transpose(unpack_matrix, [1, 0]),
                                      inputs_mask_s)  # [#,c'][*=q]
    # Keep only valid (unpadded) input positions.
    trans_matrix = C.sequence.gather(trans_matrix, inputs_mask_s)  # [#,c2][*=q]
    trans_matrix = C.element_select(
        C.sequence.broadcast_as(matrix_mask, trans_matrix), trans_matrix,
        C.constant(-1e30))
    logits2 = C.softmax(trans_matrix, axis=0, name='level 2 weight')  # [#,c2][*=c]
    unpack_new_q, new_q_mask = C.sequence.unpack(
        new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
    expand_new_q = C.transpose(
        C.sequence.broadcast_as(unpack_new_q, trans_matrix),
        [1, 0])  # [#,c2][2d,*=q]
    # Second-level attention: enriched memory weighted per input step.
    c_over_q = C.reshape(C.reduce_sum(logits2 * expand_new_q, axis=1),
                         (-1, )) / (2 * self.hidden_dim)**0.5  # [#,c2][2d]
    c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)
    # Instantiate the generic subgraph twice with shared parameters:
    # query attending over context ...
    weighted_q = c_over_q.clone(C.CloneMethod.share, {
        cln_mem_ph: memory_,
        cln_inp_ph: inputs_
    })  # [#,c][2d]
    # ... and context attending over itself.
    c2c = q_over_c.clone(C.CloneMethod.share, {
        cln_mem_ph: inputs_,
        cln_inp_ph: inputs_
    })  # [#,c][2d]
    att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d
    return C.as_block(att_context, [(input_ph, context), (input_mem, query)],
                      'attention_layer', 'attention_layer')
def self_attention_layer(in_dims: int,
                         out_dims: int,
                         name='self_attention',
                         as_block: bool = False,
                         k_ph: bool = False,
                         v_ph: bool = False,
                         mask_opt: bool = False) -> C.Function:
    """Single-head scaled dot-product self-attention.

    When both ``k_ph`` and ``v_ph`` are True, keys and values come from
    external placeholders on their own 'kv_seq' axis instead of being
    projected from the input; any other mixed combination raises.
    ``mask_opt`` enables a causal (lower-triangular) mask.
    """
    sq_sa_dims = C.Constant(C.sqrt(out_dims).eval(), name='sq_dims')
    X = C.placeholder(
        in_dims,
        (C.Axis.default_batch_axis(), C.Axis.default_dynamic_axis()),
        name=name + '_ph')

    if k_ph is False and v_ph is False:
        # Project q, k, v from the input itself.
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.layers.Dense(out_dims, name=name + '_k')(X)
        v = C.layers.Dense(out_dims, name=name + '_v')(X)
    elif k_ph is True and v_ph is True:
        # Only q is projected; k and v arrive through placeholders.
        q = C.layers.Dense(out_dims, name=name + '_q')(X)
        k = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_k_ph')
        v = C.placeholder(out_dims,
                          (C.Axis.default_batch_axis(), C.Axis('kv_seq')),
                          name=name + '_v_ph')
    else:
        raise Exception(f'k_ph:{k_ph}, v_ph:{v_ph}')

    # Unpack the dynamic sequences into dense matrices for the matmuls.
    q_flat = C.sequence.unpack(q, 0, True, name=name + '_unpack_q')
    k_flat = C.sequence.unpack(k, 0, True, name=name + '_unpack_k')
    v_flat = C.sequence.unpack(v, 0, True, name=name + '_unpack_v')

    raw_scores = C.times_transpose(q_flat, k_flat, name=name + '_score_matrix')
    scaled = raw_scores / sq_sa_dims  # divide by sqrt(d_k)

    if mask_opt:
        # Causal mask: -inf above the diagonal so softmax ignores the future.
        mask = triangular_matrix_seq(2)(X)
        inf_mask = -np.inf * (mask - 0.5)
        inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
        scaled = C.element_min(scaled, inf_mask)

    weights = C.softmax(scaled, name=name + '_softmax')
    attended = C.times(weights, v_flat, name=name + '_attention')
    result = C.to_sequence_like(attended, X)

    if not as_block:
        return result
    if k_ph is False and v_ph is False:
        return C.as_block(result, [(X, X)], 'self_attention',
                          'self_attention_')
    if k_ph is True and v_ph is True:
        return C.as_block(result, [(X, X), (k, k), (v, v)], 'self_attention',
                          'self_attention_')
    raise Exception(f'k_ph:{k_ph} v_ph:{v_ph}')
def gpt2_self_attention(token_dims: int,
                        head_dims: int,
                        mask_opt: bool = False,
                        as_block: bool = False,
                        name: str = 'self_attention'):
    """Multi-head self-attention in the style of GPT-2.

    A single Dense layer projects the input to stacked q/k/v, which are split
    into ``head_dims`` heads; each head computes scaled dot-product attention
    and the heads are spliced and projected back to ``token_dims``.

    Arguments:
        token_dims (int): dimensionality of each token vector; also the
            output dimensionality of the final projection
        head_dims (int): number of attention heads
        mask_opt (bool): when True, apply a causal (lower-triangular) mask so
            each position attends only to itself and earlier positions
        as_block (bool): when True, wrap the result in a CNTK block
        name (str): name of the input placeholder

    Returns:
        A CNTK function mapping a token sequence to an attended token
        sequence of the same dimensionality.
    """
    X = C.placeholder(token_dims,
                      dynamic_axes=(C.Axis.default_batch_axis(),
                                    C.Axis.default_dynamic_axis()),
                      name=name)

    # One dense layer produces q, k, v stacked along the first static axis.
    qkv = C.layers.Dense((3, token_dims), name='qkv')(X)
    q_seq, k_seq, v_seq = qkv[0], qkv[1], qkv[2]

    # Split each of q, k, v into `head_dims` heads.
    q_mh = C.reshape(q_seq, (head_dims, -1), name='multi_head_q')
    k_mh = C.reshape(k_seq, (head_dims, -1), name='multi_head_k')
    v_mh = C.reshape(v_seq, (head_dims, -1), name='multi_head_v')
    # BUG FIX: the k and v head lists previously reused the misspelled node
    # name 'simgle_head_q'; each list now carries its own correct name.
    q_heads = [
        C.squeeze(q_mh[i], name='single_head_q' + str(i))
        for i in range(head_dims)
    ]
    k_heads = [
        C.squeeze(k_mh[i], name='single_head_k' + str(i))
        for i in range(head_dims)
    ]
    v_heads = [
        C.squeeze(v_mh[i], name='single_head_v' + str(i))
        for i in range(head_dims)
    ]

    attention_head = []
    for i in range(head_dims):
        q = q_heads[i]
        k = k_heads[i]
        v = v_heads[i]

        # Unpack the dynamic sequences into dense matrices for the matmuls.
        q_ = C.sequence.unpack(q, 0, True, name='seq_q' + str(i))
        k_ = C.sequence.unpack(k, 0, True, name='seq_k' + str(i))
        v_ = C.sequence.unpack(v, 0, True, name='seq_v' + str(i))

        scores = C.times_transpose(q_, k_)
        scaled = scores * (1 / C.sqrt(v_.shape[-1]))

        # BUG FIX: the causal mask used to be applied unconditionally, making
        # the `mask_opt` parameter dead; it is now honored, matching
        # self_attention_layer.
        if mask_opt:
            mask = triangular_matrix_seq(2)(X)
            inf_mask = -np.inf * (mask - 0.5)
            inf_mask = C.as_block(inf_mask, [(X, X)], 'mask', 'mask')
            scaled = C.element_min(scaled, inf_mask)

        softmax = C.softmax(scaled)
        attention = C.times(softmax, v_)
        attention_seq = C.to_sequence_like(attention, X)
        attention_head.append(attention_seq)

    # Merge the heads and project back to token_dims.
    attention = C.splice(*attention_head, name='merged_attention')
    project = C.layers.Dense(token_dims, name='project')(attention)

    if as_block:
        return C.as_block(project, [(X, X)], 'gpt2_self_attention',
                          'gpt2_self_attention')
    return project