def test_swapaxes_0d_1d_operands():
    x1 = C.input_variable(())
    with pytest.raises(ValueError):
        swapaxes_0d = C.swapaxes(x1)

    x2 = C.input_variable(2)
    with pytest.raises(ValueError):
        swapaxes_1d = C.swapaxes(x2)
def model(seq_image, decoded):
    params = dense(decoded)
    g_x, g_y, sigma2, delta, gamma = attention_parameters(params)

    i = C.Constant(np.arange(n) + 1)  # col of patch
    j = C.Constant(np.arange(n) + 1)  # row of patch
    mu_x = g_x + (i - n / 2 - 0.5) * delta
    mu_y = g_y + (j - n / 2 - 0.5) * delta
    mu_x = C.expand_dims(mu_x, axis=-1)
    mu_y = C.expand_dims(mu_y, axis=-1)
    # mu_x: [#, *] [n, 1]
    # mu_y: [#, *] [n, 1]

    image = C.sequence.unpack(seq_image, padding_value=0, no_mask_output=True)
    # image: [#] [*image_width, filters, image_height]

    width_pos = Cx.sequence.position(seq_image)
    # width_pos: [#, *] [1]

    width_pos_unpacked = C.sequence.unpack(width_pos, padding_value=999_999, no_mask_output=True)
    # width_pos_unpacked: [#] [*image_width, 1]

    a = C.sequence.broadcast_as(C.swapaxes(width_pos_unpacked), mu_x)
    # a: [#, *] [1, *image_width]  # x pos index of image (width)

    b = C.Constant(np.arange(image_height).reshape((1, -1)))
    # b: [] [1, image_height]  # y pos index of image (height)

    # calculate which portion of the image is attended to by the gaussian filter
    f_xi = C.exp(-0.5 * C.square(a - mu_x) / sigma2)
    f_yj = C.exp(-0.5 * C.square(b - mu_y) / sigma2)
    # f_xi: [#, *] [n, *image_width]
    # f_yj: [#, *] [n, image_height]

    z_x = C.reduce_sum(f_xi, axis=1)
    z_y = C.reduce_sum(f_yj, axis=1)
    # z_x: [#, *] [n]
    # z_y: [#, *] [n]

    f_xi = f_xi / z_x
    f_yj = f_yj / z_y
    # f_xi: [#, *] [n, *image_width]
    # f_yj: [#, *] [n, image_height]

    # combine filters from x and y
    image_broadcasted = C.sequence.broadcast_as(image, f_yj)
    attended = gamma * C.times(f_xi, C.times_transpose(image_broadcasted, f_yj), output_rank=2)
    # attended: [#, *] [n, filters, n]

    attended = C.swapaxes(attended)
    # attended: [#, *] [filters, n (x), n (y)]
    return attended
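# For intuition, a minimal numpy sketch of one 1-D Gaussian filterbank as built
# above (f = exp(-0.5 * (a - mu)^2 / sigma2), normalised per filter). All sizes
# and parameter values here are illustrative assumptions, not from the model.
import numpy as np

n, length = 3, 5                  # hypothetical: 3 filters over an axis of length 5
g, delta, sigma2 = 2.0, 1.0, 0.5  # hypothetical attention parameters
mu = g + (np.arange(n) + 1 - n / 2 - 0.5) * delta                # filter centres, [n]
a = np.arange(length)                                            # position index, [length]
f = np.exp(-0.5 * np.square(a[None, :] - mu[:, None]) / sigma2)  # [n, length]
f /= f.sum(axis=1, keepdims=True)                                # each filter sums to 1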
def cumsum(x, axis=0):
    dim = x.shape[axis]
    # upper-triangular matrix of ones turns a matmul into a running sum
    U = C.constant(np.triu(np.ones((dim, dim))).astype(x.dtype))
    if axis != -1:
        x = C.swapaxes(x, -1, axis)
    out = C.times(x, U)
    if axis != -1:
        out = C.swapaxes(out, -1, axis)
    return out
def attention(encoded, network):
    abk = dense(network)
    a, b, k = gaussian_windows_attention_coefficients(abk, nb_mixtures)
    # a, b, k: [#, n] [nb_mixture, 1]
    # encoded: [#, c] [char_ohe]

    encoded_unpacked = C.sequence.unpack(encoded, padding_value=0, no_mask_output=True)
    # encoded_unpacked: [#] [*=c, char_ohe]

    u = Cx.sequence.position(encoded)  # position gives shape=(1,)
    # u: [#, c] [1]

    u_values, u_valid = C.sequence.unpack(u, padding_value=999_999).outputs
    # u_values: [#] [*=c, 1]
    # u_valid: [#] [*=c]

    u_values_broadcast = C.swapaxes(C.sequence.broadcast_as(u_values, k))
    # u_values_broadcast: [#, n] [1, *=c]

    u_valid_broadcast = C.sequence.broadcast_as(C.reshape(u_valid, (1,), 1), k)
    # u_valid_broadcast: [#, n] [*=c, 1] -- shape verified correct at this point

    phi = window_weight(a, b, k, u_values_broadcast)
    # phi: [#, n] [*=c, 1]

    zero = C.constant(0)
    phi = C.element_select(u_valid_broadcast, phi, zero, name="phi")
    # phi: [#, n] [*=c, 1]

    attended = C.reduce_sum(phi * C.sequence.broadcast_as(encoded_unpacked, phi), axis=0)
    # attended: [#, n] [1, char_ohe]

    output = C.squeeze(attended, name="GaussianWindowAttention")
    # output: [#, n] [char_ohe]
    return output
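# A hedged numpy sketch of the final weighted reduction above: once phi is known
# per encoder step, the attended vector is just a phi-weighted sum. Shapes are
# illustrative (c=4 encoder steps, char_ohe=3), not taken from the model.
import numpy as np

phi = np.array([[0.1], [0.5], [0.3], [0.1]])  # window weights, [c, 1]
enc = np.random.randn(4, 3)                   # unpacked encoder states, [c, char_ohe]
attended = (phi * enc).sum(axis=0)            # [char_ohe]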
def scale_dot_product_attention_block(self, contextQ, contextV, contextK, name):
    Q = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])
    V = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])
    K = C.placeholder(shape=(2 * self.hidden_dim,), dynamic_axes=[self.b_axis, self.q_axis])

    Ql = C.layers.Dense(100)(Q)
    Vl = C.layers.Dense(100)(V)
    Kl = C.layers.Dense(100)(K)

    kvw, kvw_mask = C.sequence.unpack(Kl, padding_value=0).outputs
    vvw, _ = C.sequence.unpack(Vl, padding_value=0).outputs

    KT = C.swapaxes(kvw)

    S = C.reshape(C.times(Ql, KT) / math.sqrt(100), -1)
    kvw_mask_expanded = C.sequence.broadcast_as(kvw_mask, Ql)
    S = C.softmax(C.element_select(kvw_mask_expanded, S, C.constant(-1e+30)))
    att = C.times(S, vvw)

    return C.as_block(att, [(Q, contextQ), (V, contextV), (K, contextK)],
                      'sdp_attention_block' + name, 'sdp_attention_block' + name)
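# A minimal numpy sketch of scaled dot-product attention, the computation the
# block above expresses in CNTK ops. All dimensions here are illustrative.
import numpy as np

d, q_len, kv_len = 100, 3, 5
q = np.random.randn(q_len, d)
k = np.random.randn(kv_len, d)
v = np.random.randn(kv_len, d)

scores = q @ k.T / np.sqrt(d)                                     # [q_len, kv_len]
weights = np.exp(scores) / np.exp(scores).sum(-1, keepdims=True)  # row-wise softmax
att = weights @ v                                                 # [q_len, d]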
def window_weight(a, b, k, u):
    """
    Calculates phi, the window weight of the character sequence at position u at time t.
    Function tested to be correct on 2018-02-25 using the numpy-equivalent math:
    phi = sum over mixtures of { a * exp(-b * (k - u) ^ 2) }

    Args:
        a: importance of the window within the mixture. Not normalised; does not sum to one.
        b: width of the attention window
        k: location of the window
        u: integer position of each item in the sequence, from 1 to seq_length (rank 2 tensor) [-3, 1]

    Returns:
        :class:`~cntk.ops.functions.Function`

    """
    phi = a * C.exp(-1 * b * C.square(k - u))
    phi = C.swapaxes(C.reduce_sum(phi, axis=0))  # reduce-sum over the mixture axis
    # phi: [#, n] [*=c, 1]
    return phi
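# The docstring's numpy-equivalent math, written out as a hedged sketch with
# illustrative values (nb_mixtures=2, sequence length c=4); not from the source.
import numpy as np

a = np.array([[0.5], [1.0]])        # importance per mixture, [n, 1]
b = np.array([[0.1], [0.2]])        # window width per mixture, [n, 1]
k = np.array([[1.0], [2.5]])        # window location per mixture, [n, 1]
u = np.arange(1, 5).reshape(1, -1)  # positions 1..c, [1, c]

phi = (a * np.exp(-b * np.square(k - u))).sum(axis=0)  # sum over the mixture axis, [c]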
def cumsum(x, axis: int = -1):
    """
    Calculates the cumulative sum across a static axis

    Arguments:
        x: input tensor
        axis (int): static axis of tensor to cumsum over

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    d = x.shape[axis]
    u = C.constant(np.triu(np.ones((d, d))).astype(x.dtype))
    if axis != -1:
        x = C.swapaxes(x, -1, axis)
    z = C.times(x, u)
    if axis != -1:
        z = C.swapaxes(z, -1, axis)
    return z
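# Sanity check for the upper-triangular trick above, as a numpy sketch:
# multiplying by triu(ones) accumulates values along the last axis.
import numpy as np

x = np.array([[1., 2., 3.],
              [4., 5., 6.]])
u = np.triu(np.ones((3, 3)))
assert np.allclose(x @ u, np.cumsum(x, axis=-1))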
def swapaxes(t, axis1, axis2):
    if K.backend() == "cntk":
        # CNTK's static axes exclude the batch axis (axis 0 in Keras),
        # hence the -1 offset on both axis arguments
        return C.swapaxes(t, axis1=(axis1 - 1), axis2=(axis2 - 1))  # 0, 3, 2, 1, 4
    else:
        swap = np.arange(K.ndim(t))
        swap[axis1] = axis2
        swap[axis2] = axis1
        return K.permute_dimensions(t, swap)
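# Hedged usage sketch for the wrapper above, assuming the TensorFlow backend
# (so the K.permute_dimensions branch runs); the shape is illustrative.
import numpy as np
from keras import backend as K

t = K.constant(np.zeros((2, 3, 4, 5)))  # axis 0 is the batch axis
swapped = swapaxes(t, 1, 2)
print(K.int_shape(swapped))             # (2, 4, 3, 5)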
def attention_layer(self, context, query, layer):
    q_processed = C.placeholder(shape=(2 * self.hidden_dim,))
    p_processed = C.placeholder(shape=(2 * self.hidden_dim,))

    qvw, qvw_mask = C.sequence.unpack(q_processed, padding_value=0).outputs

    wq = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    wp = C.parameter(shape=(2 * self.hidden_dim, 2 * self.hidden_dim), init=C.glorot_uniform())
    wg = C.parameter(shape=(8 * self.hidden_dim, 8 * self.hidden_dim), init=C.glorot_uniform())
    v = C.parameter(shape=(2 * self.hidden_dim, 1), init=C.glorot_uniform())

    # seq[tensor[2d]] p_len x 2d
    wpt = C.reshape(C.times(p_processed, wp), (-1, 2 * self.hidden_dim))
    # q_len x 2d
    wqt = C.reshape(C.times(qvw, wq), (-1, 2 * self.hidden_dim))
    # seq[tensor[q_len]]
    S = C.reshape(C.times(C.tanh(C.sequence.broadcast_as(wqt, p_processed) + wpt), v), (-1))
    qvw_mask_expanded = C.sequence.broadcast_as(qvw_mask, p_processed)
    # seq[tensor[q_len]]
    S = C.element_select(qvw_mask_expanded, S, C.constant(-1e+30))
    # seq[tensor[q_len]]
    A = C.softmax(S, axis=0)
    # seq[tensor[2d]]
    swap_qvw = C.swapaxes(qvw)
    cq = C.reshape(C.reduce_sum(A * C.sequence.broadcast_as(swap_qvw, A), axis=1), (-1))
    # seq[tensor[8d]]: four 2d pieces spliced together
    uc_concat = C.splice(p_processed, cq, p_processed * cq, cq * cq)
    # seq[tensor[8d]]
    gt = C.tanh(C.times(uc_concat, wg))
    # seq[tensor[8d]]
    uc_concat_star = gt * uc_concat
    # seq[tensor[2d]]
    vp = C.layers.Sequential([
        C.layers.Dropout(self.dropout),
        OptimizedRnnStack(self.hidden_dim, bidirectional=True,
                          use_cudnn=self.use_cudnn, name=layer + '_attention_rnn')])(uc_concat_star)

    return C.as_block(
        vp,
        [(p_processed, context), (q_processed, query)],
        'attention_layer',
        'attention_layer')
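# A hedged numpy sketch of the additive (tanh) scoring used above, for a single
# context step; dimensions are illustrative (d=4, q_len=5), not from the model.
import numpy as np

d, q_len = 4, 5
wq, wp = np.random.randn(d, d), np.random.randn(d, d)
v = np.random.randn(d, 1)
qvw = np.random.randn(q_len, d)       # unpacked query states
p_t = np.random.randn(d)              # one processed context step

s = np.tanh(qvw @ wq + p_t @ wp) @ v  # scores over query positions, [q_len, 1]
a = np.exp(s) / np.exp(s).sum()       # softmax, [q_len, 1]
cq = (a * qvw).sum(axis=0)            # attended query vector, [d]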
def group_lstm(dh, dc, x):
    x_grps = split(x, groups).outputs
    dh_grps = split(dh, groups).outputs
    dc_grps = split(dc, groups).outputs

    h_grps = []
    c_grps = []
    for lstm, h_grp, c_grp, x_grp in zip(lstms, dh_grps, dc_grps, x_grps):
        h, c = lstm(h_grp, c_grp, x_grp).outputs
        h_grps.append(h)
        c_grps.append(c)

    # inter-group correlation through permutation of dimensions
    h_output = C.reshape(C.swapaxes(C.splice(*h_grps, axis=C.Axis.new_leading_axis())), (shape,))
    c_output = C.reshape(C.swapaxes(C.splice(*c_grps, axis=C.Axis.new_leading_axis())), (shape,))
    return h_output, c_output
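# The splice/swapaxes/reshape at the end is a "channel shuffle" that interleaves
# the per-group outputs. A numpy sketch with illustrative sizes (2 groups of 3):
import numpy as np

h = np.arange(6).reshape(2, 3)           # [groups, per_group]: [[0,1,2],[3,4,5]]
shuffled = h.swapaxes(0, 1).reshape(-1)  # [0, 3, 1, 4, 2, 5]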
def var(array, W=_W, B=None, square=0, sqrt=0, V=False, sizz=0):
    # W = tf.transpose(W, [0, 2, 3, 1])
    arrs = array.shape
    ashp = W.shape
    sb = (W.shape[1], 1, 1)
    WV = W.shape[-2:]
    xi = (-2, -1)
    x2 = (-2, -1, -3)

    if V:
        print(W.eval())
        print(arrs, ashp)

    mul = (array * W)

    if V:
        print('Wsamp', W[-1, -1].eval())
        print('array*w', (mul.eval())[0, -1])

    size = C.reduce_sum(W, axis=xi)  # shape=(outputs, channel)

    if V:
        print("sizesamp", size.shape, size.eval())

    if B is None:
        B = C.constant(0, shape=W.shape[0:2], dtype=np.float32)  # channel
    B = C.reshape(B, (*B.shape, *[1 for _ in range(len(ashp) - len(B.shape))]))

    if sizz == 1:
        mean = C.reduce_sum(mul, axis=xi) / size
    else:
        mean = C.reduce_sum(mul, axis=xi) / C.constant(value=WV[0] * WV[1], shape=sb, dtype=np.float32)

    if V:
        print("meansamp", mean.eval()[0, -1])

    if square:
        i = (C.square(mul - mean) + B)
    else:
        i = (((mul) - mean) + B)

    di = i / size

    if V == 2:
        print("i", i.eval(), "i")
        print("di", di.eval(), "di")

    if V:
        print('isamp', i.shape, i.eval()[-1, -1, ])

    out = C.reduce_sum(i + B, axis=x2)
    # out = np.rollaxis(np.sum(i + B, axis=x2), -1, 1)
    print(out.shape)

    if sqrt:
        out = C.sqrt(out)

    out = C.swapaxes(C.reshape(out, out.shape[:4]), 3, 1)
    print(out.shape)

    assert out.shape == (arrs[0], ashp[0], arrs[1], arrs[2])
    return out
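# One reading of the square=1 path above, as a hedged numpy sketch: a
# variance-like reduction of array * W over the window axes. The weights,
# shapes, and interpretation here are assumptions, not from the source.
import numpy as np

x = np.random.randn(2, 3, 4, 4)                 # illustrative [out, ch, kH, kW]
w = np.ones_like(x)                             # uniform window weights
mul = x * w
mean = mul.mean(axis=(-2, -1), keepdims=True)   # per-window mean
out = np.square(mul - mean).sum(axis=(-2, -1))  # [2, 3]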
def model(query, key, value):
    q = phi(query_linear(query))
    k = phi(key_linear(key))
    v = value_linear(value)

    # key and value should have the same sequence length
    k_unpacked = C.sequence.unpack(k, padding_value=0, no_mask_output=True)
    # k_unpacked: [#] [*kv=, model_dim]
    v_unpacked = C.sequence.unpack(v, padding_value=0, no_mask_output=True)
    # v_unpacked: [#] [*kv=, hidden_dim]

    kv = C.times(C.swapaxes(k_unpacked), v_unpacked)
    # kv: [#] [model_dim, hidden_dim]
    kv_broadcasted = C.sequence.broadcast_as(kv, q)  # this can be reused across queries
    # kv_broadcasted: [#, *] [model_dim, hidden_dim]

    numerator = C.squeeze(C.times(C.expand_dims(q, axis=C.Axis.new_leading_axis()), kv_broadcasted))
    # numerator: [#, *] [hidden_dim,]
    denom = C.reduce_sum(q * C.sequence.broadcast_as(C.sequence.reduce_sum(k), q))
    # denom: [#, *] [1]

    return numerator / denom
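# A hedged numpy sketch of the linearisation above: with a positive feature map
# phi, k.T @ v is computed once and reused for every query position. The feature
# map (elu + 1) and all dimensions here are illustrative assumptions.
import numpy as np

def phi(x):
    return np.where(x > 0, x + 1.0, np.exp(x))  # elu(x) + 1, always positive

q = phi(np.random.randn(3, 4))  # [q_len, model_dim]
k = phi(np.random.randn(5, 4))  # [kv_len, model_dim]
v = np.random.randn(5, 6)       # [kv_len, hidden_dim]

kv = k.T @ v                    # [model_dim, hidden_dim], independent of q_len
numerator = q @ kv              # [q_len, hidden_dim]
denom = q @ k.sum(axis=0, keepdims=True).T  # [q_len, 1]
out = numerator / denom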
def test_TransposeAxes(tmpdir):
    data = [[[0, 1], [2, 3], [4, 5]]]
    model = C.swapaxes(data, 1, 2)
    verify_no_input(model, tmpdir, 'TransposeAxes_0')
def test_TransposeAxes(tmpdir, dtype):
    with C.default_options(dtype=dtype):
        data = np.array([[[0, 1], [2, 3], [4, 5]]]).astype(dtype)
        model = C.swapaxes(data, 1, 2)
        verify_no_input(model, tmpdir, 'TransposeAxes_0')