def multiFunc(self, arg1):
    # load or create the inputs we need
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    max_bits = self.bit_map.max()
    shape = multiIn.shape
    reformed = C.reshape(multiIn, (-1,))
    # let's compute the means we need
    # carry_over represents the remaining value that needs to be binarized. For a single bit, this is just the input.
    # For more bits, it is the difference between the previous bits' approximation and the true value.
    carry_over = multiIn
    approx = C.element_times(multiIn, 0)
    # iterate through the maximum number of bits specified by the bit map, computing each level of binarization
    for i in range(max_bits):
        # determine which values of the input should be binarized to i bits or more
        hot_vals = C.greater(bit_map, i)
        # select only the values which we need to binarize
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        # compute the mean on a per-kernel basis; reshaping allows the sum reduction to keep only axis 0 (the kernels)
        mean = C.element_divide(
            C.reduce_sum(C.reshape(C.abs(valid_vals), (valid_vals.shape[0], -1)), axis=1),
            C.reduce_sum(C.reshape(hot_vals, (hot_vals.shape[0], -1)), axis=1))
        # reshape the mean to match the dimensionality of the input
        mean = C.reshape(mean, (mean.shape[0], mean.shape[1], 1, 1))
        # binarize the carry over
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        # add the equivalent binary representation to the approximation
        approx = C.plus(approx, C.element_times(mean, bits))
        # compute the new carry over
        carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
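# Hedged illustration (not part of the original source): the same greedy multi-bit
# binarization that multiFunc assembles as a CNTK graph, written in plain NumPy for a
# single weight tensor with a uniform bit budget. At each level the residual is
# binarized to {-1, +1}, scaled by the mean absolute residual, and subtracted.
import numpy as np

def multi_bit_binarize(w, num_bits):
    carry_over = w.astype(np.float64)
    approx = np.zeros_like(carry_over)
    for _ in range(num_bits):
        mean = np.abs(carry_over).mean()            # per-tensor scale (multiFunc uses per-kernel means)
        bits = np.where(carry_over > 0, 1.0, -1.0)
        approx += mean * bits                       # contribution of this bit level
        carry_over -= mean * bits                   # residual to be captured by the next bit
    return approx

w = np.array([0.9, -0.3, 0.05, -0.7])
print(multi_bit_binarize(w, 1))   # coarse 1-bit approximation
print(multi_bit_binarize(w, 3))   # residual error shrinks as more bits are used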
def new_attention(encoder_hidden_state, decoder_hidden_state):
    # encoder_hidden_state: [#, e] [h]
    # decoder_hidden_state: [#, d] [H]
    unpacked_encoder_hidden_state, valid_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
    # unpacked_encoder_hidden_state: [#] [*=e, h]
    # valid_mask: [#] [*=e]
    projected_encoder_hidden_state = C.sequence.broadcast_as(attn_proj_enc(unpacked_encoder_hidden_state), decoder_hidden_state)
    # projected_encoder_hidden_state: [#, d] [*=e, attention_dim]
    broadcast_valid_mask = C.sequence.broadcast_as(C.reshape(valid_mask, (1,), 1), decoder_hidden_state)
    # broadcast_valid_mask: [#, d] [*=e]
    projected_decoder_hidden_state = attn_proj_dec(decoder_hidden_state)
    # projected_decoder_hidden_state: [#, d] [attention_dim]
    tanh_output = C.tanh(projected_decoder_hidden_state + projected_encoder_hidden_state)
    # tanh_output: [#, d] [*=e, attention_dim]
    attention_logits = attn_proj_tanh(tanh_output)
    # attention_logits: [#, d] [*=e, 1]
    minus_inf = C.constant(-1e+30)
    masked_attention_logits = C.element_select(broadcast_valid_mask, attention_logits, minus_inf)
    # masked_attention_logits: [#, d] [*=e]
    attention_weights = C.softmax(masked_attention_logits, axis=0)
    attention_weights = Label('attention_weights')(attention_weights)
    # attention_weights: [#, d] [*=e]
    attended_encoder_hidden_state = C.reduce_sum(attention_weights * C.sequence.broadcast_as(unpacked_encoder_hidden_state, attention_weights), axis=0)
    # attended_encoder_hidden_state: [#, d] [1, h]
    output = attn_final_stab(C.reshape(attended_encoder_hidden_state, (), 0, 1))
    # output: [#, d] [h]
    return output
def signFunc(self, arg):
    # create an input variable that matches the dimension of the input argument
    signIn = C.input(shape=arg.shape, dynamic_axes=arg.dynamic_axes)
    # first stage of the sign function: check if the input is greater than zero
    actionfunc = C.greater(signIn, 0)
    # second stage of the sign function: replace any 0s with -1s
    return C.element_select(actionfunc, actionfunc, -1), signIn
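# Hedged sketch (assumption, not from the original source): the two stages of signFunc
# amount to a sign function that maps zero to -1. C.greater produces a {0, 1} mask, and
# C.element_select(mask, mask, -1) keeps the 1s and turns the 0s into -1s:
import numpy as np

x = np.array([-2.0, 0.0, 3.5])
mask = (x > 0).astype(np.float32)      # stage 1: C.greater(signIn, 0)
sign = np.where(mask, mask, -1.0)      # stage 2: C.element_select(mask, mask, -1)
print(sign)                            # [-1. -1.  1.]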
def attention(query, key, value):
    dk = C.reduce_sum(C.ones_like(query))  # cannot use sequence.last, will conflict with recurrence
    # dk: [#, *] [1,] and its value equals int(dim_of_query)

    unpacked_key = C.sequence.unpack(key, padding_value=0, no_mask_output=True)      # [#] [-3, key_dim]
    unpacked_value = C.sequence.unpack(value, padding_value=0, no_mask_output=True)  # [#] [-3, value_dim]

    broadcasted_key = C.sequence.broadcast_as(unpacked_key, query)  # [#, *] [-3, key_dim]
    scaled = C.times_transpose(query, broadcasted_key) / dk
    # [#, *] [q_dim] @ [#, *] [key_dim, -3], assert q_dim == key_dim
    # scaled: [#, *] [-3,] => for every key seq element, there is a corresponding score

    # mask out invalid temporal connections to obey_sequence_order
    if obey_sequence_order and max_seq_len:
        unpacked_scaled, scaled_mask = C.sequence.unpack(scaled, padding_value=0).outputs
        # unpacked_scaled: [#] [-3, -3]  <== matrix will be top-right diagonally zeroed
        # scaled_mask: [#] [-3,]

        minus_inf = C.constant(-1e+30)
        valid_connections = C.Constant(np.tril(np.ones((max_seq_len, max_seq_len)), k=0))  # [] [max_seq, max_seq]
        valid_connections = C.reconcile_dynamic_axes(valid_connections, unpacked_scaled)   # [#] [max_seq, max_seq]
        valid_connections = C.crop_manual(valid_connections, unpacked_scaled, 0, 0)        # [#] [-3, -3]
        unpacked_scaled = C.element_select(valid_connections, unpacked_scaled, minus_inf)  # [#] [-3, -3]
        scaled = C.to_sequence_like(unpacked_scaled, query)                                # [#, *] [-3]

    elif obey_sequence_order and not max_seq_len:
        raise ValueError("max_seq_len must be defined when obey_sequence_order is True")

    attended = C.times(C.softmax(scaled, axis=-1), C.sequence.broadcast_as(unpacked_value, query))  # [#, *] [value_dim,]
    return attended
def scale_dot_product_attention_block(self, contextQ, contextV, contextK, name):
    Q = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])
    V = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])
    K = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])

    Ql = C.layers.Dense(100)(Q)
    Vl = C.layers.Dense(100)(V)
    Kl = C.layers.Dense(100)(K)

    kvw, kvw_mask = C.sequence.unpack(Kl, padding_value=0).outputs
    vvw, _ = C.sequence.unpack(Vl, padding_value=0).outputs
    KT = C.swapaxes(kvw)

    S = C.reshape(C.times(Ql, KT) / math.sqrt(100), -1)
    kvw_mask_expanded = C.sequence.broadcast_as(kvw_mask, Ql)
    S = C.softmax(C.element_select(kvw_mask_expanded, S, C.constant(-1e+30)))
    att = C.times(S, vvw)

    return C.as_block(att,
                      [(Q, contextQ), (V, contextV), (K, contextK)],
                      'sdp_attention_block' + name,
                      'sdp_attention_block' + name)
def attention(encoded, network):
    abk = dense(network)
    a, b, k = gaussian_windows_attention_coefficients(abk, nb_mixtures)
    # print("abk shape:", a.shape, b.shape, k.shape)
    # a, b, k: [#, n] [nb_mixture, 1]
    # context: [#, c] [char_ohe]

    encoded_unpacked = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
    # encoded_unpacked: [#] [*=c, char_ohe]

    u = Cx.sequence.position(encoded)  # position gives shape=(1,)
    # u: [#, c] [1]
    u_values, u_valid = C.sequence.unpack(u, padding_value=999_999).outputs
    # u_values: [#] [*=c, 1]
    # u_valid: [#] [*=c]
    u_values_broadcast = C.swapaxes(C.sequence.broadcast_as(u_values, k))
    # u_values_broadcast: [#, n] [1, *=c]
    u_valid_broadcast = C.sequence.broadcast_as(C.reshape(u_valid, (1,), 1), k)
    # u_valid_broadcast: [#, n] [*=c, 1] ~ shape verified correct at this point

    # print("u_values_broadcast shape:", u_values_broadcast.shape)
    # print("abk shape:", a.shape, b.shape, k.shape)
    phi = window_weight(a, b, k, u_values_broadcast)
    # phi: [#, n] [*=c, 1]
    zero = C.constant(0)
    phi = C.element_select(u_valid_broadcast, phi, zero, name="phi")
    # phi: [#, n] [*=c, 1]
    attended = C.reduce_sum(phi * C.sequence.broadcast_as(encoded_unpacked, phi), axis=0)
    # attended: [#, n] [1, char_ohe]
    # print("attended shape:", attended.shape)
    output = C.squeeze(attended, name="GaussianWindowAttention")
    # output: [#, n] [char_ohe]
    return output
def simi_attention(self, input, memory):
    '''
    return:
        memory weighted vectors over input [#,c][d]
        weight
    '''
    input_ph = C.placeholder()  # [#,c][d]
    mem_ph = C.placeholder()    # [#,q][d]

    input_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    mem_dense = Dense(2 * self.hidden_dim, bias=False, input_rank=1)
    bias = C.Parameter(shape=(2 * self.hidden_dim,), init=0.0)
    weight_dense = Dense(1, bias=False, input_rank=1)

    proj_inp = input_dense(input_ph)  # [#,c][d]
    proj_mem = mem_dense(mem_ph)      # [#,q][d]
    unpack_memory, mem_mask = C.sequence.unpack(proj_mem, 0).outputs  # [#][*=q, d], [#][*=q]
    expand_mem = C.sequence.broadcast_as(unpack_memory, proj_inp)     # [#,c][*=q, d]
    expand_mask = C.sequence.broadcast_as(mem_mask, proj_inp)         # [#,c][*=q]
    matrix = C.reshape(weight_dense(C.tanh(proj_inp + expand_mem + bias)), (-1,))  # [#,c][*=q]
    matrix = C.element_select(expand_mask, matrix, -1e30)
    logits = C.softmax(matrix, axis=0)  # [#,c][*=q]
    weight_mem = C.reduce_sum(C.reshape(logits, (-1, 1)) * expand_mem, axis=0)  # [#,c][d]
    weight_mem = C.reshape(weight_mem, (-1,))

    return C.as_block(C.combine(weight_mem, logits),
                      [(input_ph, input), (mem_ph, memory)],
                      'simi_attention', 'simi_attention')
def multiFunc(self, arg1):
    # simpler variant of multiFunc: a single mean over the whole tensor instead of per-kernel means
    multiIn = C.input(shape=arg1.shape, dynamic_axes=arg1.dynamic_axes)
    bit_map = C.constant(self.bit_map)
    max_bits = self.bit_map.max()
    carry_over = multiIn
    approx = C.element_times(multiIn, 0)
    for i in range(max_bits):
        hot_vals = C.greater(bit_map, i)
        valid_vals = C.element_select(hot_vals, carry_over, 0)
        mean = C.element_divide(C.reduce_sum(C.abs(valid_vals)), C.reduce_sum(hot_vals))
        bits = C.greater(carry_over, 0)
        bits = C.element_select(bits, bits, -1)
        bits = C.element_select(hot_vals, bits, 0)
        approx = C.plus(approx, C.element_times(mean, bits))
        carry_over = C.plus(C.element_times(C.element_times(-1, bits), mean), carry_over)

    return approx, multiIn
def inner_padded(x, y, p):
    padded, valid_x = _inner(x, y)

    # replace zero pad by scatter with padding token
    if p is not None:
        broadcasted_padding_token = C.sequence.broadcast_as(p, padded)
        padded = C.element_select(1 - valid_x, broadcasted_padding_token, padded)

    return padded  # [*, long_seq] [short_seq_dim,]
def inner(a):
    not_negative = C.greater_equal(a, 0)
    sign = C.element_select(not_negative, not_negative, -1)

    abs_x = C.abs(a)

    # A&S formula 7.1.26; t must be computed from |a| so the approximation also holds for negative inputs
    t = 1.0 / (1.0 + p * abs_x)
    y = 1.0 - (((((a5 * t + a4) * t) + a3) * t + a2) * t + a1) * t * C.exp(-abs_x * abs_x)
    return C.element_times(sign, y)
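# Hedged check (assumption): a1..a5 and p live in the enclosing scope and are not shown above;
# the standard Abramowitz & Stegun 7.1.26 constants are assumed here. This plain-Python version
# mirrors the graph above and compares it against math.erf.
import math

p = 0.3275911
a1, a2, a3, a4, a5 = 0.254829592, -0.284496736, 1.421413741, -1.453152027, 1.061405429

def erf_approx(x):
    sign = 1.0 if x >= 0 else -1.0
    ax = abs(x)
    t = 1.0 / (1.0 + p * ax)
    y = 1.0 - (((((a5 * t + a4) * t + a3) * t + a2) * t + a1) * t) * math.exp(-ax * ax)
    return sign * y

for x in (-2.0, -0.5, 0.0, 0.5, 2.0):
    # A&S quote a maximum absolute error of about 1.5e-7 for this approximation
    assert abs(erf_approx(x) - math.erf(x)) < 2e-7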
def test_Select(flag, if_true, if_false, tmpdir):
    flag = np.asarray(flag, dtype=np.float32)
    if_true = np.asarray(if_true, dtype=np.float32)
    if_false = np.asarray(if_false, dtype=np.float32)

    model = C.element_select(flag, if_true, if_false)
    verify_no_input(model, tmpdir, 'Select_0')

    flag_var = C.input_variable(np.shape(flag))
    if_true_var = C.input_variable(np.shape(if_true))
    if_false_var = C.input_variable(np.shape(if_false))

    model = C.element_select(flag_var, if_true, if_false)
    verify_one_input(model, flag, tmpdir, 'Select_1_flag')

    model = C.element_select(flag, if_true_var, if_false)
    verify_one_input(model, if_true, tmpdir, 'Select_1_if_true')

    model = C.element_select(flag, if_true, if_false_var)
    verify_one_input(model, if_false, tmpdir, 'Select_1_if_false')
def true_density(z):
    z1, z2 = z[0], z[1]
    w1 = lambda x: C.sin(2 * np.pi * x / 4)
    u = 0.5 * C.square((z2 - w1(z1)) / 0.4)
    dummy = C.ones_like(u) * 1e7

    # u = C.element_select(C.less_equal(z1, 4), u, dummy)
    cond = C.less_equal(z1, 4)
    u = C.element_select(cond, u, dummy)  # equivalent to: u = cond*u + (1-cond)*dummy
    return C.exp(-u)
def attention_layer(self, context, query, layer):
    q_processed = C.placeholder(shape=(2*self.hidden_dim,))
    p_processed = C.placeholder(shape=(2*self.hidden_dim,))

    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    wq = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
    wp = C.parameter(shape=(2*self.hidden_dim, 2*self.hidden_dim), init=C.glorot_uniform())
    wg = C.parameter(shape=(8*self.hidden_dim, 8*self.hidden_dim), init=C.glorot_uniform())
    v = C.parameter(shape=(2*self.hidden_dim, 1), init=C.glorot_uniform())

    # seq[tensor[2d]] p_len x 2d
    wpt = C.reshape(C.times(p_processed, wp), (-1, 2*self.hidden_dim))
    # q_len x 2d
    wqt = C.reshape(C.times(qvw, wq), (-1, 2*self.hidden_dim))
    # seq[tensor[q_len]]
    S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)
    # seq[tensor[q_len]]
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    # seq[tensor[q_len]]
    A = C.softmax(S, axis=0)
    # seq[tensor[2d]]
    swap_qvw = C.swapaxes(qvw)
    cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))
    # seq[tensor[4d]]
    uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)
    # seq[tensor[4d]]
    gt = C.tanh(C.times(uc_concat, wg))
    # seq[tensor[4d]]
    uc_concat_star = gt * uc_concat
    # seq[tensor[4d]]
    vp = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                          use_cudnn=self.use_cudnn, name=layer+'_attention_rnn')])(uc_concat_star)

    return C.as_block(
        vp,
        [(p_processed, context), (q_processed, query)],
        'attention_layer',
        'attention_layer')
def attention_layer(self, context, query):
    q_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    c_processed = C.placeholder(shape=(2 * self.hidden_dim,))

    # convert query's sequence axis to static
    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    # This part deserves some explanation.
    # It is the attention layer. In the paper they use a 6 * dim dimensional vector;
    # here we split it in three parts because the different parts
    # participate in very different operations,
    # so W * [h; u; h .* u] becomes w1 * h + w2 * u + w3 * (h .* u)
    ws1 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws2 = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())
    ws3 = C.parameter(shape=(1, 2 * self.hidden_dim), init=C.glorot_uniform())
    att_bias = C.parameter(shape=(), init=0)

    wh = C.times(c_processed, ws1)
    wu = C.reshape(C.times(qvw, ws2), (-1,))
    whu = C.reshape(C.reduce_sum(c_processed * C.sequence.broadcast_as(qvw * ws3, c_processed), axis=1), (-1,))
    S = wh + whu + C.sequence.broadcast_as(wu, c_processed) + att_bias

    # mask out values outside of Query, and fill in gaps with -1e+30 as a neutral value
    # for both reduce_log_sum_exp and reduce_max
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    q_attn = C.reshape(C.softmax(S), (-1, 1))
    # q_attn = print_node(q_attn)
    c2q = C.reshape(C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn, axis=0), (-1))

    max_col = C.reduce_max(S)
    c_attn = C.sequence.softmax(max_col)

    htilde = C.sequence.reduce_sum(c_processed * c_attn)
    q2c = C.sequence.broadcast_as(htilde, c_processed)
    q2c_out = c_processed * q2c

    att_context = C.splice(c_processed, c2q, c_processed * c2q, q2c_out)

    return C.as_block(
        att_context,
        [(c_processed, context), (q_processed, query)],
        'attention_layer',
        'attention_layer')
def attention_pooling(inputs, inputs_mask, inputs_weights, decode, decode_weights, keys):
    """
    inputs: shape=(n, dim)
    inputs_weights: shape=(dim, dim)
    decode: shape=(1, dec_dim)
    decode_weights: shape=(dec_dim, dim)
    keys: shape=(dim, 1)
    """
    w_in = C.times(inputs, inputs_weights)   # shape=(n, dim)
    w_dec = C.times(decode, decode_weights)  # shape=(dim, 1)

    S = C.tanh(w_in + C.sequence.broadcast_as(w_dec, w_in))  # shape=(n, dim)
    S = C.element_select(inputs_mask, S, C.constant(-1e+30))
    S = C.times(S, keys)  # shape=(n)
    S = C.ops.sequence.softmax(S, name="softmax")

    attention = C.reduce_sum(inputs * S, axis=0)
    return attention
def attention_layer(self, context, query, dimc, dimq, common_dim):
    q_processed = C.placeholder(shape=(dimq,))
    c_processed = C.placeholder(shape=(dimc,))

    # convert query's sequence axis to static
    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    # so W * [h; u; h .* u] becomes w1 * h + w2 * u + w4 * (h .* u)
    ws1 = C.parameter(shape=(dimc, 1), init=C.glorot_uniform())
    ws2 = C.parameter(shape=(dimq, 1), init=C.glorot_uniform())
    ws4 = C.parameter(shape=(1, common_dim), init=C.glorot_uniform())
    att_bias = C.parameter(shape=(), init=0)

    wh = C.times(c_processed, ws1)            # [#,c][1]
    wu = C.reshape(C.times(qvw, ws2), (-1,))  # [#][*]
    # qvw*ws4: [#][*,200], whu: [#,c][*]
    whu = C.reshape(C.reduce_sum(
        c_processed[:common_dim] *
        C.sequence.broadcast_as(qvw[:, :common_dim] * ws4, c_processed), axis=1), (-1,))
    S1 = wh + C.sequence.broadcast_as(wu, c_processed) + att_bias  # [#,c][*]
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, c_processed)
    S1 = C.element_select(qvw_mask_expanded, S1, C.constant(-1e+30))
    q_attn = C.reshape(C.softmax(S1), (-1, 1))  # [#,c][*,1]
    c2q = C.reshape(C.reduce_sum(C.sequence.broadcast_as(qvw, q_attn) * q_attn, axis=0), (-1))  # [#,c][200]

    max_col = C.reduce_max(S1)            # [#,c][1]  the query word with the highest score
    c_attn = C.sequence.softmax(max_col)  # [#,c][1]  softmax over every word in the context

    htilde = C.sequence.reduce_sum(c_processed * c_attn)  # [#][200]
    q2c = C.sequence.broadcast_as(htilde, c_processed)    # [#,c][200]
    q2c_out = c_processed[:common_dim] * q2c[:common_dim]

    # original document, question representation, document-focus representation,
    # match representation, document-context representation
    att_context_reg = C.splice(c_processed, c2q, q2c_out,
                               c_processed[:common_dim] * c2q[:common_dim])
    res = C.combine(att_context_reg, C.reshape(q_attn, (-1,)))

    return C.as_block(res,
                      [(c_processed, context), (q_processed, query)],
                      'attention_layer',
                      'attention_layer')
def dot_attention(self, inputs, memory, dim):
    '''
    @inputs: [#,c][d] a sequence that needs attention
    @memory (key): [#,q][d] a sequence used to compute the similarity (weights)
    @value: [#,q][d] a sequence used for the weighted sum
    @output: [#,c][d] attention vector
    '''
    input_ph = C.placeholder()
    input_mem = C.placeholder()
    with C.layers.default_options(bias=False, activation=C.relu):  # all the projections have no bias
        attn_proj_enc = C.layers.Dense(dim, init=glorot_uniform(), input_rank=1, name="Wqu")
        attn_proj_dec = C.layers.Dense(dim, init=glorot_uniform(), input_rank=1)

    inputs_ = attn_proj_enc(input_ph)   # [#,c][d]
    memory_ = attn_proj_dec(input_mem)  # [#,q][d]

    unpack_memory, mem_mask = C.sequence.unpack(memory_, 0).outputs  # [#][*=q, d], [#][*=q]
    unpack_memory_expand = C.sequence.broadcast_as(unpack_memory, inputs_)  # [#,c][*=q, d]

    matrix = C.times_transpose(inputs_, unpack_memory_expand) / (dim ** 0.5)  # [#,c][*=q]
    mem_mask_expand = C.sequence.broadcast_as(mem_mask, inputs_)  # [#,c][*=q]
    matrix = C.element_select(mem_mask_expand, matrix, C.constant(-1e+30))  # [#,c][*=q]
    logits = C.reshape(C.softmax(matrix), (-1, 1))  # [#,c][*=q, 1]
    # [#,c][*=q, d]
    memory_expand = C.sequence.broadcast_as(C.sequence.unpack(input_mem, 0, no_mask_output=True), input_ph)
    weighted_att = C.reshape(C.reduce_sum(logits * memory_expand, axis=0), (-1,))  # [#,c][d]

    return C.as_block(C.combine(weighted_att, logits),
                      [(input_ph, inputs), (input_mem, memory)],
                      'dot attention', 'dot attention')
def inner(a):
    # a: [#, *] [static_axes, num_classes]
    k_values, k_indices = C.top_k(a, k=k, axis=axis).outputs
    # k_indices: [#, *] [static_axes, k]
    b = C.one_hot(k_indices, num_classes)
    # b: [#, *] [static_axes, k, num_classes]
    valid_probabilities = C.squeeze(C.reduce_sum(b, axis=-2), axes=(-2,))
    # valid_probabilities: [#, *] [static_axes, num_classes]

    # the k largest probabilities are retained, everything else is set to -inf and will not be sampled
    minus_inf = C.constant(-1e+30)
    d = a * valid_probabilities
    e = C.element_select(d, d, minus_inf)
    # e: [#, *] [static_axes, num_classes]

    # sample from the top_k distribution once
    s = sample(e, axis=axis, name=name)
    # s: [#, *] [static_axes, num_classes]
    return s
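# Hedged illustration (not from the original source): the filtering step above in NumPy.
# Every logit at least as large as the k-th largest is kept; the rest are pushed to a very
# large negative value so the subsequent softmax/sampling effectively ignores them.
import numpy as np

def top_k_filter(logits, k, minus_inf=-1e30):
    kth = np.sort(logits)[-k]                        # value of the k-th largest logit
    return np.where(logits >= kth, logits, minus_inf)

logits = np.array([2.0, -1.0, 0.5, 3.0, 0.0])
filtered = top_k_filter(logits, k=2)
probs = np.exp(filtered - filtered.max())
probs /= probs.sum()
print(probs)                                         # only the two largest logits get probability mass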
def build(self):
    input_kernel = C.Parameter(shape=(self._input_size, self._hidden_dim), init=self._input_initializer)
    recur_kernel = C.Parameter(shape=(self._hidden_dim,), init=self._recurrent_initializer)
    bias = C.Parameter(shape=(self._hidden_dim), init=0)

    if self._recurrent_min_abs > 0:
        abs_kernel = C.abs(recur_kernel)
        min_abs_kernel = C.element_max(abs_kernel, self._recurrent_min_abs)
        recur_kernel = min_abs_kernel * C.element_select(
            C.greater_equal(recur_kernel, C.constant(0)), C.constant(1), C.constant(-1))

    if self._recurrent_max_abs:
        recur_kernel = C.clip(recur_kernel, -self._recurrent_max_abs, self._recurrent_max_abs)

    @C.Function
    def runit(h, x):
        h_t = C.times(x, input_kernel) + bias + recur_kernel * h
        return h_t

    return runit
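# Hedged illustration (not from the original source): the effect of the _recurrent_min_abs
# branch above. The recurrent weights are pushed away from zero while keeping their sign:
import numpy as np

recur_kernel = np.array([0.001, -0.02, 0.5, -0.7])
min_abs = 0.05
clamped = np.maximum(np.abs(recur_kernel), min_abs) * np.where(recur_kernel >= 0, 1.0, -1.0)
print(clamped)   # [ 0.05 -0.05  0.5  -0.7 ]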
def element_select(flag, value_if_true, value_if_false, name=''):
    '''
    Return either value_if_true or value_if_false based on the value of flag.
    If flag != 0, value_if_true is returned, otherwise value_if_false.
    Behaves analogously to numpy.where(...).

    Example:
        >>> C.eval(C.element_select([-10, -1, 0, 0.3, 100], [1, 10, 100, 1000, 10000], [ 2, 20, 200, 2000, 20000]))
        [array([[  1.00000000e+00,   1.00000000e+01,   2.00000000e+02,
                   1.00000000e+03,   1.00000000e+04]])]

    Args:
        flag: tensor
        value_if_true: tensor
        value_if_false: tensor
        name (str): the name of the node in the network

    Returns:
        :class:`cntk.Function`
    '''
    from cntk import element_select
    flag = sanitize_input(flag)
    value_if_true = sanitize_input(value_if_true)
    value_if_false = sanitize_input(value_if_false)
    return element_select(flag, value_if_true, value_if_false, name).output()
def attention_layer(self, context, query, dim):
    input_ph = C.placeholder(shape=(dim,))
    input_mem = C.placeholder(shape=(dim,))
    with C.layers.default_options(bias=False, activation=C.relu):
        attn_proj_enc = C.layers.Dense(self.hidden_dim, init=glorot_uniform(), input_rank=1, name="Wqu")
        attn_proj_dec = C.layers.Dense(self.hidden_dim, init=glorot_uniform(), input_rank=1)

    inputs_ = attn_proj_enc(input_ph)   # [#,c][d]
    memory_ = attn_proj_dec(input_mem)  # [#,q][d]

    cln_mem_ph = C.placeholder()  # [#,q][?=d]
    cln_inp_ph = C.placeholder()  # [#,c][?=d]
    unpack_inputs, inputs_mask = C.sequence.unpack(cln_inp_ph, 0).outputs  # [#][*=c,d] [#][*=c]
    expand_inputs = C.sequence.broadcast_as(unpack_inputs, cln_mem_ph)  # [#,q][*=c,d]
    matrix = C.reshape(C.times_transpose(cln_mem_ph, expand_inputs) / (self.hidden_dim ** 0.5), (-1,))  # [#,q][*=c]
    matrix = C.element_select(C.sequence.broadcast_as(inputs_mask, cln_mem_ph), matrix, C.constant(-1e30))
    logits = C.softmax(matrix, axis=0, name='level 1 weight')  # [#,q][*=c]
    trans_expand_inputs = C.transpose(expand_inputs, [1, 0])  # [#,q][d,*=c]
    q_over_c = C.reshape(C.reduce_sum(logits * trans_expand_inputs, axis=1), (-1,)) / (self.hidden_dim ** 0.5)  # [#,q][d]
    new_q = C.splice(cln_mem_ph, q_over_c)  # [#,q][2*d]
    # over
    unpack_matrix, matrix_mask = C.sequence.unpack(matrix, 0).outputs  # [#][*=q,*=c] [#][*=q]
    inputs_mask_s = C.to_sequence(C.reshape(inputs_mask, (-1, 1)))  # [#,c'][1]
    trans_matrix = C.to_sequence_like(C.transpose(unpack_matrix, [1, 0]), inputs_mask_s)  # [#,c'][*=q]
    trans_matrix = C.sequence.gather(trans_matrix, inputs_mask_s)  # [#,c2][*=q]
    trans_matrix = C.element_select(C.sequence.broadcast_as(matrix_mask, trans_matrix), trans_matrix, C.constant(-1e30))
    logits2 = C.softmax(trans_matrix, axis=0, name='level 2 weight')  # [#,c2][*=c]
    unpack_new_q, new_q_mask = C.sequence.unpack(new_q, 0).outputs  # [#][*=q,2*d] [#][*=q]
    expand_new_q = C.transpose(C.sequence.broadcast_as(unpack_new_q, trans_matrix), [1, 0])  # [#,c2][2d,*=q]
    c_over_q = C.reshape(C.reduce_sum(logits2 * expand_new_q, axis=1), (-1,)) / (2 * self.hidden_dim) ** 0.5  # [#,c2][2d]
    c_over_q = C.reconcile_dynamic_axes(c_over_q, cln_inp_ph)

    weighted_q = c_over_q.clone(C.CloneMethod.share, {cln_mem_ph: memory_, cln_inp_ph: inputs_})  # [#,c][2d]
    c2c = q_over_c.clone(C.CloneMethod.share, {cln_mem_ph: inputs_, cln_inp_ph: inputs_})  # [#,c][2d]

    att_context = C.splice(input_ph, weighted_q, c2c)  # 2d+2d+2d

    return C.as_block(att_context,
                      [(input_ph, context), (input_mem, query)],
                      'attention_layer', 'attention_layer')
def word_level_drop(self, doc):
    # doc: [#, c][d]
    # randomly zero out whole word positions with probability 0.08 (one uniform draw per word)
    seq_shape = C.sequence.is_first(doc)
    u = C.random.uniform_like(seq_shape, seed=98052)
    mask = C.element_select(C.greater(u, 0.08), 1.0, 0)
    return doc * mask
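# Hedged illustration (assumption, not from the original source): what the mask above does,
# shown in NumPy for one document of 6 word vectors. Each word position is zeroed out with
# probability 0.08; surviving words are left unscaled.
import numpy as np

rng = np.random.default_rng(98052)
doc = rng.normal(size=(6, 4))                  # [words, d]
u = rng.uniform(size=(6, 1))                   # one draw per word position
mask = np.where(u > 0.08, 1.0, 0.0)
dropped = doc * mask                           # rows with u <= 0.08 become all zeros
print(mask.ravel())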
def pad_ctc_labels(ctc_labels, network_output):
    """
    Pads the shorter truth label sequence to the same sequence length as the network output.

    This should be used when the final sequence length of the network output cannot be determined
    beforehand during the pre-processing of the ctc_labels, so the padding is done at training
    runtime instead of in the data pipeline.

    The padding token is the last sequence element of `ctc_labels`. `ctc_labels` should be a
    one-hot encoded vector sequence. The padding token will have the value of 1 in its one-hot
    encoded vector.

    Example:
        # first example
        labels = C.sequence.input_variable(10)
        network_outputs = model(...)

        padded_labels = pad_ctc_labels(labels, network_outputs)

        # second example
        a = C.sequence.input_variable(3, sequence_axis=ax1)
        b = C.sequence.input_variable(6, sequence_axis=ax2)

        c = pad_ctc_labels(a, b)

        padding_token = np.array([0, 0, 1])
        n1 = [np.array([[0, 2, 0],
                        [2, 0, 0],
                        [0, 0, 2], ]).astype(np.float32), ]

        n2 = [np.random.random((20, 6)).astype(np.float32),
              np.random.random((22, 6)).astype(np.float32),
              np.random.random((24, 6)).astype(np.float32), ]

        n1 = n1 * len(n2)

        results = c.eval({a: n1, b: n2})

        for seq, result in zip(n2, results):
            for r in result[3:]:
                assert np.all(r == padding_token)

            assert result.shape[0] == seq.shape[0]

    Arguments:
        ctc_labels: one-hot-encoded ctc labels tensor
        network_output: output from model network

    Returns:
        :class:`~cntk.ops.functions.Function`
        a sequence tensor with the same sequence axis as network_output and ctc padded
    """
    last_labels = C.sequence.last(ctc_labels)  # last token has a one-hot-encoded value of 2 for ctc training
    last_labels = C.element_select(last_labels, 1, 0)  # replace the value of 2 with 1

    padded_labels = pad_to(ctc_labels, network_output, padding_token=last_labels)
    return padded_labels
def create_network(input_vocab_dim, label_vocab_dim):
    # network complexity; initially low for faster testing
    hidden_dim = 256
    num_layers = 1

    # Source and target inputs to the model
    input_seq_axis = Axis('inputAxis')
    label_seq_axis = Axis('labelAxis')
    raw_input = sequence.input(shape=(input_vocab_dim), sequence_axis=input_seq_axis, name='raw_input')
    raw_labels = sequence.input(shape=(label_vocab_dim), sequence_axis=label_seq_axis, name='raw_labels')

    # Instantiate the sequence to sequence translation model
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = sequence.slice(raw_labels, 1, 0)   # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)   # <s>

    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...
    label_sentence_start_scattered = sequence.scatter(label_sentence_start, is_first_label)

    # Encoder
    encoder_outputH = stabilize(input_sequence)
    for i in range(0, num_layers):
        (encoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            encoder_outputH.output, hidden_dim, hidden_dim, future_value, future_value)

    thought_vectorH = sequence.first(encoder_outputH)
    thought_vectorC = sequence.first(encoder_outputC)

    thought_vector_broadcastH = sequence.broadcast_as(thought_vectorH, label_sequence)
    thought_vector_broadcastC = sequence.broadcast_as(thought_vectorC, label_sequence)

    # Decoder
    decoder_history_hook = alias(label_sequence, name='decoder_history_hook')  # copy label_sequence

    decoder_input = element_select(is_first_label, label_sentence_start_scattered,
                                   past_value(decoder_history_hook))

    decoder_outputH = stabilize(decoder_input)
    for i in range(0, num_layers):
        if (i > 0):
            recurrence_hookH = past_value
            recurrence_hookC = past_value
        else:
            isFirst = sequence.is_first(label_sequence)
            recurrence_hookH = lambda operand: element_select(isFirst, thought_vector_broadcastH, past_value(operand))
            recurrence_hookC = lambda operand: element_select(isFirst, thought_vector_broadcastC, past_value(operand))

        (decoder_outputH, encoder_outputC) = LSTMP_component_with_self_stabilization(
            decoder_outputH.output, hidden_dim, hidden_dim, recurrence_hookH, recurrence_hookC)

    decoder_output = decoder_outputH

    # Softmax output layer
    z = linear_layer(stabilize(decoder_output), label_vocab_dim)

    # Criterion nodes
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)

    # network output for decoder history
    net_output = hardmax(z)

    # make a clone of the graph where the ground truth is replaced by the network output
    ng = z.clone(CloneMethod.share, {decoder_history_hook.output: net_output.output})

    return {
        'raw_input': raw_input,
        'raw_labels': raw_labels,
        'ce': ce,
        'pe': errs,
        'ng': ng,
        'output': z
    }
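# Hedged illustration (assumption, not part of the original script): the element_select
# pattern used for decoder_input and the recurrence hooks above, shown in NumPy for a toy
# sequence. At the first step the flag is 1, so the start token (or thought vector) is used;
# afterwards the flag is 0 and the delayed value flows through.
import numpy as np

is_first = np.array([1, 0, 0, 0])        # sequence.is_first(...)
start_token = np.array([7, 7, 7, 7])     # scattered <s> embedding (toy values)
past = np.array([0, 10, 20, 30])         # past_value(...) of the decoder history
decoder_input = np.where(is_first, start_token, past)
print(decoder_input)                     # [ 7 10 20 30]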