def model(seq_image, decoded):
    """Read an n x n glimpse from a sequence image with a grid of gaussian filters.

    The attention parameters (grid centre, variance, stride and intensity) are
    predicted from ``decoded`` through ``dense`` and ``attention_parameters``.
    ``dense``, ``attention_parameters``, ``n`` and ``image_height`` are taken
    from the enclosing scope.

    Arguments:
        seq_image: image given as a sequence along its width
        decoded: tensor the attention parameters are predicted from

    Returns:
        the attended patch, annotated in the code as
        [#, *] [filters, n (x), n (y)]
    """
    g_x, g_y, sigma2, delta, gamma = attention_parameters(dense(decoded))

    patch_cols = C.Constant(np.arange(n) + 1)  # col of patch
    patch_rows = C.Constant(np.arange(n) + 1)  # row of patch

    # centre of every filter, spaced ``delta`` apart around (g_x, g_y)
    centre_x = g_x + (patch_cols - n / 2 - 0.5) * delta
    centre_y = g_y + (patch_rows - n / 2 - 0.5) * delta
    centre_x = C.expand_dims(centre_x, axis=-1)
    centre_y = C.expand_dims(centre_y, axis=-1)
    # centre_x: [#, *] [n, 1]
    # centre_y: [#, *] [n, 1]

    image = C.sequence.unpack(seq_image, padding_value=0, no_mask_output=True)
    # image: [#] [*image_width, filters, image_height]

    step_index = Cx.sequence.position(seq_image)
    # step_index: [#, *] [1]

    # padded steps receive a very large position value
    step_index_unpacked = C.sequence.unpack(step_index, padding_value=999_999, no_mask_output=True)
    # step_index_unpacked: [#] [*image_width, 1]

    # x pos index of image (width)
    x_index = C.sequence.broadcast_as(C.swapaxes(step_index_unpacked), centre_x)
    # x_index: [#, *] [1, *image_width]

    # y pos index of image (height)
    y_index = C.Constant(np.arange(image_height).reshape((1, -1)))
    # y_index: [] [1, image_height]

    # portion of the image attended by each gaussian filter
    filter_x = C.exp(-0.5 * C.square(x_index - centre_x) / sigma2)
    filter_y = C.exp(-0.5 * C.square(y_index - centre_y) / sigma2)
    # filter_x: [#, *] [n, *image_width]
    # filter_y: [#, *] [n, image_height]

    # normalise every filter by its sum over axis 1
    norm_x = C.reduce_sum(filter_x, axis=1)
    norm_y = C.reduce_sum(filter_y, axis=1)
    filter_x = filter_x / norm_x
    filter_y = filter_y / norm_y
    # filter_x: [#, *] [n, *image_width]
    # filter_y: [#, *] [n, image_height]

    # combine filters from x and y
    image_per_step = C.sequence.broadcast_as(image, filter_y)
    filtered_rows = C.times_transpose(image_per_step, filter_y)
    glimpse = gamma * C.times(filter_x, filtered_rows, output_rank=2)
    # glimpse: [#, *] [n, filters, n]

    return C.swapaxes(glimpse)
    # returned: [#, *] [filters, n (x) , n (y)]
def gaussian_windows_attention_coefficients(abk, nb_mixtures):
    """ Split ``abk`` along axis 0 into the window coefficients a, b and k.

    Each group spans ``nb_mixtures`` entries. The third group is taken from
    ``2 * nb_mixtures`` with an end index of 0 and is then accumulated over
    the sequence with ``Recurrence(C.plus)``. Every returned tensor gets a
    trailing axis appended.

    Arguments:
        abk: tensor holding the three coefficient groups back to back
        nb_mixtures (int): number of entries in each group

    Returns:
        tuple (a, b, k)
    """
    first_group = C.slice(abk, 0, 0, nb_mixtures)
    second_group = C.slice(abk, 0, nb_mixtures, 2 * nb_mixtures)
    third_group = C.slice(abk, 0, 2 * nb_mixtures, 0)

    # k is summed up step by step along the sequence
    accumulated = Recurrence(C.plus)(third_group)

    return (
        C.expand_dims(first_group, axis=-1),
        C.expand_dims(second_group, axis=-1),
        C.expand_dims(accumulated, axis=-1),
    )
def sample_gaussian_mdn(prediction_tensor, nmix: int, ndim: int):
    """ Builds the nodes that draw a sample from mixture density network outputs

    The prediction is split by ``gaussian_mdn_coeff`` into mixture weights,
    means and sigmas. ``random.sample`` is applied to the weights and its
    result is used to reduce the means and sigmas over axis 0 (presumably a
    one-hot pick of a single mixture - confirm against ``random.sample``).
    The sample is normal noise scaled by the reduced sigma and shifted by the
    reduced mean.

    Example:
        ndim, nmix = 1, 3
        a = C.input_variable(ndim)
        prediction = Dense((ndim + 2) * nmix)(a)
        sampled = sample_gaussian_mdn(prediction, nmix, ndim)

        results = sampled.eval({a: x})  # different results every time you eval

    Arguments:
        prediction_tensor: input tensor
        nmix (int): number of mixture
        ndim (int): number of dimension of gaussian

    Returns:
        :class:`~cntk.ops.functions.Function`
    """
    weights, means, sigmas = gaussian_mdn_coeff(prediction_tensor, nmix=nmix, ndim=ndim)

    selection = random.sample(weights)
    chosen_mean = C.reduce_sum(means * C.expand_dims(selection, axis=-1), axis=0)
    chosen_sigma = C.reduce_sum(sigmas * selection, axis=0)

    noise = C.random.normal_like(chosen_sigma)
    return noise * chosen_sigma + chosen_mean
def inner(a):
    """Return sequence ``a`` with its steps in reverse order.

    The sequence is unpacked together with its validity mask, both are sliced
    along axis 0 with a stride of -1, turned back into sequences, and the
    mask is used to gather only the steps it marks.
    """
    unpacked = C.sequence.unpack(a, padding_value=0)
    values, valid = unpacked.outputs

    flipped_values = C.slice(values, 0, 0, 0, -1)
    flipped_valid = C.slice(valid, 0, 0, 0, -1)

    flipped_values_seq = C.to_sequence(flipped_values)
    # the mask gets a trailing axis before it becomes a sequence
    flipped_valid_seq = C.to_sequence(C.expand_dims(flipped_valid, axis=-1))

    return C.sequence.gather(flipped_values_seq, flipped_valid_seq)
def zeros_like(x, seq_length: int):
    """ Build a sequence of zeros with ``seq_length`` steps shaped like ``x``.

    Arguments:
        x: sequence tensor the zeros are modelled on
        seq_length (int): number of steps wanted, must be at least 1

    Returns:
        zeros derived from the first ``seq_length`` steps of ``x``

    Raises:
        ValueError: when ``seq_length`` is neither 1 nor larger than 1
    """
    if seq_length == 1:
        # single step: zero the first element and wrap it back into a sequence
        first_step_zeros = C.zeros_like(C.sequence.first(x))
        return C.to_sequence(
            C.expand_dims(first_step_zeros, axis=C.Axis.new_leading_axis()))

    if seq_length > 1:
        return C.zeros_like(C.sequence.slice(x, 0, seq_length))

    raise ValueError(f"length ({seq_length}) must be larger than 0")
def inner(a, b):
    """Join sequences ``a`` and ``b`` into one sequence, ``a`` first.

    Both sequences are unpacked with their validity masks, values and masks
    are spliced, and the spliced mask is used to gather only the steps it
    marks from the spliced values.
    """
    a_dense, a_valid = C.sequence.unpack(a, padding_value=0).outputs
    b_dense, b_valid = C.sequence.unpack(b, padding_value=0).outputs

    joined_dense = C.splice(a_dense, b_dense, axis=0)
    # the mask gets a trailing axis before it becomes a sequence
    joined_valid = C.expand_dims(C.splice(a_valid, b_valid), axis=-1)

    joined_seq = C.to_sequence(joined_dense)
    keep_seq = C.to_sequence(joined_valid)

    return C.sequence.gather(joined_seq, keep_seq)
def test_expand_dims(operand_shape, axis, device_id, precision):
    """Compare ``C.expand_dims`` with ``np.expand_dims`` for one axis value.

    Cases whose ``axis`` is None or a tuple return immediately without
    checking anything. The expected backward value is all ones with the
    operand's shape.
    """
    if isinstance(axis, tuple) or axis is None:
        return

    operand = np.arange(np.prod(operand_shape)).reshape(operand_shape).astype('f')

    forward_reference = [np.expand_dims(operand, axis)]
    backward_reference = {'arg': [np.ones_like(operand)]}

    from .. import expand_dims, placeholder

    node_under_test = C.expand_dims(C.placeholder(), axis)

    _test_unary_op(precision, device_id, node_under_test, operand,
                   forward_reference, backward_reference)
def test_expand_dims(operand_shape, axis, device_id, precision):
    """Compare ``C.expand_dims`` with ``np.expand_dims`` for one axis value.

    Cases whose ``axis`` is None or a tuple return immediately without
    checking anything. The expected backward value is all ones with the
    operand's shape.
    """
    if isinstance(axis, tuple) or axis is None:
        return

    operand = np.arange(np.prod(operand_shape)).reshape(operand_shape).astype('f')

    forward_reference = [np.expand_dims(operand, axis)]
    backward_reference = {'arg': [np.ones_like(operand)]}

    from .. import expand_dims, placeholder

    node_under_test = C.expand_dims(C.placeholder(), axis)

    _test_unary_op(precision, device_id, node_under_test, operand,
                   forward_reference, backward_reference)
def model(query, key, value):
    """Attend ``value`` with ``query`` and ``key`` through a key-value summary.

    Queries and keys go through their linear layers and ``phi``; values go
    through their linear layer only. Keys and values are contracted once into
    a single matrix that every query step reuses. ``phi``, ``query_linear``,
    ``key_linear`` and ``value_linear`` are taken from the enclosing scope.

    Arguments:
        query: query sequence
        key: key sequence, expected to share its length with ``value``
        value: value sequence

    Returns:
        numerator / denominator, one result per query step
    """
    mapped_query = phi(query_linear(query))
    mapped_key = phi(key_linear(key))
    projected_value = value_linear(value)

    # key and value should have the same sequence length
    key_dense = C.sequence.unpack(mapped_key, padding_value=0, no_mask_output=True)
    # key_dense: [#] [*kv=, model_dim]
    value_dense = C.sequence.unpack(projected_value, padding_value=0, no_mask_output=True)
    # value_dense: [#] [*kv=, hidden_dim]

    key_value = C.times(C.swapaxes(key_dense), value_dense)
    # key_value: [#] [model_dim, hidden_dim]

    # this can be reused across queries
    key_value_per_query = C.sequence.broadcast_as(key_value, mapped_query)
    # key_value_per_query: [#, *] [model_dim, hidden_dim]

    query_row = C.expand_dims(mapped_query, axis=C.Axis.new_leading_axis())
    numerator = C.squeeze(C.times(query_row, key_value_per_query))
    # numerator: [#, *] [hidden_dim, ]

    key_total = C.sequence.broadcast_as(C.sequence.reduce_sum(mapped_key), mapped_query)
    denominator = C.reduce_sum(mapped_query * key_total)
    # denominator: [#, *] [1]

    return numerator / denominator
def gaussian_mdn_phi(target, mu, sigma, ndim: int):
    """ Evaluates the gaussian density of the target under every mixture

    Computes, per mixture,
    ``exp(-|target - mu|^2 / (2 * sigma^2)) / ((2 * pi)^(ndim / 2) * sigma^ndim)``
    where the squared distance is taken over the last axis.

    Arguments:
        target: target tensor with shape (ndim, )
        mu: means of gaussian mdn with shape (nmix, ndim)
        sigma: sigma of gaussian mdn
        ndim (int): number of dimensions in gaussian

    Returns:
        :class:`~cntk.ops.functions.Function`

    Raises:
        ValueError: when ``mu`` does not have exactly two static axes
    """
    if len(mu.shape) != 2:
        raise ValueError("mu {0} must have shape (nmix, ndim)".format(mu.shape))

    # a leading axis lets the target broadcast against every mixture mean
    offset = C.expand_dims(target, axis=0) - mu
    squared_distance = C.square(C.reduce_l2(offset, axis=-1))

    density_shape = C.exp(C.negate(squared_distance / (2 * C.square(sigma))))
    normaliser = C.reciprocal((2 * pi) ** (ndim / 2) * C.pow(sigma, ndim))
    return normaliser * density_shape
def inner(a):
    """Return the step index of every element of sequence ``a``.

    A constant 1 is broadcast over ``a`` and passed to ``C.sequence.where``;
    the result is tied back to the dynamic axes of ``a`` and given a trailing
    axis.
    """
    ones = C.sequence.broadcast_as(1, a)
    step_index = C.sequence.where(ones)

    # reconcile_dynamic_axes is necessary to avoid subtle bugs e.g. sequence.where and one_hot
    step_index = C.reconcile_dynamic_axes(step_index, a)

    return C.expand_dims(step_index, axis=-1)
def inner(a):
    """Return the number of steps in sequence ``a``.

    A constant 1 is broadcast over every step of ``a`` and summed along the
    sequence; the sum gets a new leading axis.
    """
    ones = C.sequence.broadcast_as(1, a)
    step_count = C.sequence.reduce_sum(ones)
    return C.expand_dims(step_count, axis=C.Axis.new_leading_axis())
def seq_op_func(seqinp):
    """Add every step of ``seqinp`` to its following step.

    The following step comes from ``C.sequence.future_value`` and receives a
    new axis at index ``-len(seqinp.shape) - 1`` before the addition.
    """
    new_axis = -len(seqinp.shape) - 1
    following = C.sequence.future_value(seqinp)
    following = C.expand_dims(following, new_axis)
    return seqinp + following
def main():
    """Build a CNTK graph that slices a coefficient grid and run or profile it.

    For every output channel the graph gathers the eight grid entries
    surrounding each pixel (floor / ceil in x, y and the guide-driven z),
    blends them with the weights wx, wy and wz, and applies the result to the
    input image as ``ci`` multipliers plus one offset.

    With ``show_image`` set, an image is read from ``/data/rgb.png``, pushed
    through the graph and written to ``/output/imout.png``. Otherwise random
    inputs are used and ``loss.grad`` is timed: 5 warm-up calls followed by 50
    measured calls, reporting the mean time per call in milliseconds.
    """
    show_image = False

    # Both modes share every size except the image resolution.
    bs = 1
    ci = 3
    co = 3
    cg = co * (ci + 1)
    gd = 8
    gh = 64
    gw = 64
    h = w = 256 if show_image else 1024

    im = C.input_variable([bs, ci, h, w], needs_gradient=True, dynamic_axes=[])
    guide = C.input_variable([bs, h, w], needs_gradient=True, dynamic_axes=[])
    # second guide input without gradient; it feeds only the gather indices
    guide_no_grad = C.input_variable([bs, h, w], needs_gradient=False, dynamic_axes=[])
    grid = C.input_variable([bs, cg, gd, gh, gw], needs_gradient=True, dynamic_axes=[])

    # Create indices
    xx = np.arange(0, w).reshape(1, -1).repeat(h, 0).astype(np.float32)
    yy = np.arange(0, h).reshape(-1, 1).repeat(w, 1).astype(np.float32)
    xx = C.Constant(xx, xx.shape)
    yy = C.Constant(yy, yy.shape)

    # pixel centres mapped to grid coordinates
    gx = ((xx + 0.5) / w) * gw
    gy = ((yy + 0.5) / h) * gh
    gz = C.clip(guide, 0.0, 1.0) * gd
    gz_no_grad = C.clip(guide_no_grad, 0.0, 1.0) * gd

    # lower neighbour along each grid axis
    fx = C.element_max(C.floor(gx - 0.5), 0.0)
    fy = C.element_max(C.floor(gy - 0.5), 0.0)
    fz = C.element_max(C.floor(gz - 0.5), 0.0)
    fz_no_grad = C.element_max(C.floor(gz_no_grad - 0.5), 0.0)

    # interpolation weights
    wx = gx - 0.5 - fx
    wy = gy - 0.5 - fy
    wx = C.expand_dims(C.expand_dims(wx, -1 - len(wx.shape)), -1 - len(wx.shape))
    wy = C.expand_dims(C.expand_dims(wy, -1 - len(wy.shape)), -1 - len(wy.shape))
    wz = C.abs(gz - 0.5 - fz)
    wz = C.expand_dims(wz, 0)

    fx = C.expand_dims(C.expand_dims(fx, -1 - len(fx.shape)), -1 - len(fx.shape))
    fy = C.expand_dims(C.expand_dims(fy, -1 - len(fy.shape)), -1 - len(fy.shape))

    # upper neighbour along each grid axis, clamped to the grid
    cx = C.element_min(fx + 1, gw - 1)
    cy = C.element_min(fy + 1, gh - 1)
    cz = C.element_min(fz_no_grad + 1, gd - 1)

    batch_idx = np.arange(bs).reshape(bs, 1, 1, 1).astype(np.float32)
    batch_idx = C.Constant(batch_idx, batch_idx.shape)

    flat_grid = C.reshape(grid, [-1])

    def flatten_and_gather(x, y, z, c_idx):
        # Linear offset into the flattened grid, then gather and restore shape.
        linear_idx = (x + gw * y + gw * gh * z + c_idx * gw * gh * gd
                      + batch_idx * gw * gh * gd * cg)
        flat_linear_idx = C.reshape(linear_idx, [-1])
        return C.reshape(C.gather(flat_grid, flat_linear_idx), linear_idx.shape)

    out = []
    for c_ in range(co):
        # grid channels holding the ci multipliers and the offset of output c_
        c_idx = np.arange((ci + 1) * c_, (ci + 1) * (c_ + 1)).reshape(1, ci + 1, 1, 1).astype(np.float32)
        c_idx = C.Constant(c_idx, c_idx.shape)

        gather_fff = flatten_and_gather(fx, fy, fz_no_grad, c_idx)
        gather_ffc = flatten_and_gather(fx, fy, cz, c_idx)
        gather_fcf = flatten_and_gather(fx, cy, fz_no_grad, c_idx)
        gather_fcc = flatten_and_gather(fx, cy, cz, c_idx)
        gather_cff = flatten_and_gather(cx, fy, fz_no_grad, c_idx)
        gather_cfc = flatten_and_gather(cx, fy, cz, c_idx)
        gather_ccf = flatten_and_gather(cx, cy, fz_no_grad, c_idx)
        gather_ccc = flatten_and_gather(cx, cy, cz, c_idx)

        a = (gather_fff * (1 - wx) * (1 - wy) * (1 - wz)
             + gather_ffc * (1 - wx) * (1 - wy) * wz
             + gather_fcf * (1 - wx) * wy * (1 - wz)
             + gather_fcc * (1 - wx) * wy * wz
             + gather_cff * wx * (1 - wy) * (1 - wz)
             + gather_cfc * wx * (1 - wy) * wz
             + gather_ccf * wx * wy * (1 - wz)
             + gather_ccc * wx * wy * wz)

        # all but the last coefficient multiply the image, the last is added
        o = C.reduce_sum(a[:, :-1, ...] * im, 1) + a[:, -1, ...]
        print(o.shape)
        out.append(C.expand_dims(o, 0))

    out = C.splice(*out, axis=1)
    loss = C.reduce_l2(out)

    grid_val = np.random.rand(bs, cg, gd, gh, gw).astype(np.float32)

    if show_image:
        guide_val = skio.imread("/data/rgb.png").mean(2)[:h, :w].astype(
            np.float32)
        guide_val = np.expand_dims(guide_val / 255.0, 0)
        im_val = np.tile(np.expand_dims(guide_val, 1), [1, 3, 1, 1])
        out_val = out.eval({
            im: im_val,
            guide: guide_val,
            guide_no_grad: guide_val,
            grid: grid_val
        })
        out_val = np.clip(np.transpose(np.squeeze(out_val), [1, 2, 0]), 0, 1)
        skio.imsave("/output/imout.png", out_val)
    else:
        # float32 like every other fed array, so no float64 data is handed to
        # the graph inside the timed loop
        im_val = np.random.randn(bs, ci, h, w).astype(np.float32)
        guide_val = np.random.rand(bs, h, w).astype(np.float32)

        # built once so the timed loop measures loss.grad only
        feed = {
            im: im_val,
            guide: guide_val,
            guide_no_grad: guide_val,
            grid: grid_val
        }

        # burning iteration
        for it in range(5):
            print('burning (', it, ')')
            loss.grad(feed)

        # actual iterations
        start = time.time()
        for it in range(50):
            print('profiling (', it, ')')
            loss.grad(feed)
        end = time.time()

        runtime = (end - start) * 1000.0 / 50.0
        print('Runtime:', runtime)
def main():
    """Build a CNTK graph that resamples an image through an affine map and time it.

    A 2x3 matrix per batch entry transforms a normalised pixel grid; the image
    is gathered at four neighbouring positions and blended with the weights wx
    and wy. ``loss.grad`` is then called 5 times as warm-up and 50 times under
    a timer, and the mean time per call is printed in milliseconds.

    A second, gradient-free copy of the matrix input is used only to build the
    gather indices.
    """
    bs, c, h, w = 4, 16, 512, 512

    im = C.input_variable([bs, c, h, w], needs_gradient=True, dynamic_axes=[])
    affine_mtx = C.input_variable([bs, 2, 3], needs_gradient=True, dynamic_axes=[])
    affine_mtx_ng = C.input_variable([bs, 2, 3], needs_gradient=False, dynamic_axes=[])

    # pixel coordinates, normalised with 2 * (coord / size) - 1
    col_grid = np.arange(0, w).reshape(1, -1).repeat(h, 0).astype(np.float32)
    row_grid = np.arange(0, h).reshape(-1, 1).repeat(w, 1).astype(np.float32)
    xx = C.Constant(col_grid, col_grid.shape)
    yy = C.Constant(row_grid, row_grid.shape)
    nrm_x = 2.0 * (xx / w) - 1.0
    nrm_y = 2.0 * (yy / h) - 1.0
    nrm_x = C.expand_dims(nrm_x, -1 - len(nrm_x.shape))
    nrm_y = C.expand_dims(nrm_y, -1 - len(nrm_y.shape))

    def transform(mtx, row):
        # One row of the affine map applied to the normalised grid.
        return mtx[:, row, 0] * nrm_x + mtx[:, row, 1] * nrm_y + mtx[:, row, 2]

    xformed_x = transform(affine_mtx, 0)
    xformed_y = transform(affine_mtx, 1)
    xformed_x = 0.5 * xformed_x + 1.0
    xformed_y = 0.5 * xformed_y + 1.0
    xformed_x = C.expand_dims(xformed_x, 0)
    xformed_y = C.expand_dims(xformed_y, 0)

    xformed_x_ng = C.expand_dims(transform(affine_mtx_ng, 0), 0)
    xformed_y_ng = C.expand_dims(transform(affine_mtx_ng, 1), 0)

    fx = C.clip(w * xformed_x, 0, w - 2)
    fy = C.clip(h * xformed_y, 0, h - 2)
    wx = xformed_x - fx
    wy = xformed_y - fy
    fx_ng = C.clip(w * xformed_x_ng, 0, w - 2)
    fy_ng = C.clip(h * xformed_y_ng, 0, h - 2)

    chan_idx = np.arange(c).reshape(1, c, 1, 1)
    chan_idx = C.Constant(chan_idx, chan_idx.shape)
    batch_idx = np.arange(bs).reshape(bs, 1, 1, 1)
    batch_idx = C.Constant(batch_idx, batch_idx.shape)

    flat_im = C.reshape(im, [-1])

    def flatten_and_gather(x, y):
        # Linear offset into the flattened image, then gather and restore shape.
        linear_idx = x + w * y
        linear_idx = linear_idx + w * h * chan_idx + w * h * c * batch_idx
        flat_linear_idx = C.reshape(linear_idx, [-1])
        return C.reshape(C.gather(flat_im, flat_linear_idx), linear_idx.shape)

    gather_ff = flatten_and_gather(fx_ng, fy_ng)
    gather_fc = flatten_and_gather(fx_ng, fy_ng + 1)
    gather_cf = flatten_and_gather(fx_ng + 1, fy_ng)
    gather_cc = flatten_and_gather(fx_ng + 1, fy_ng + 1)

    out = (gather_ff * (1 - wx) * (1 - wy)
           + gather_fc * (1 - wx) * wy
           + gather_cf * wx * (1 - wy)
           + gather_cc * wx * wy)
    loss = C.reduce_l2(out)

    im_val = np.random.randn(bs, c, h, w).astype(np.float32)
    affine_mtx_val = np.zeros([bs, 2, 3], dtype=np.float32)
    affine_mtx_val[:, 0, 1] = 1.0
    affine_mtx_val[:, 1, 0] = 1.0

    feed = {im: im_val, affine_mtx: affine_mtx_val, affine_mtx_ng: affine_mtx_val}

    # burning iteration
    for it in range(5):
        print('burning (', it, ')')
        loss.grad(feed)

    # actual iterations
    start = time.time()
    for it in range(50):
        print('profiling (', it, ')')
        loss.grad(feed)
    end = time.time()

    runtime = (end - start) * 1000.0 / 50.0
    print('Runtime:', runtime)
def seq_op_func(seqinp):
    """Add every step of ``seqinp`` to its following step.

    The following step comes from ``C.sequence.future_value`` and receives a
    new axis at index ``-len(seqinp.shape) - 1`` before the addition.
    """
    new_axis = -len(seqinp.shape) - 1
    following = C.sequence.future_value(seqinp)
    following = C.expand_dims(following, new_axis)
    return seqinp + following