def run(zero_point, scale):
    """Compare ``fake_quant_tensor`` with the ``fake_quant_tensor_gt`` reference.

    For the given ``zero_point`` and ``scale`` this checks, in order, the
    forward output, the gradient with respect to the input, and that an
    all-NaN input produces an all-NaN output.
    """
    qparams = create_qparams(QuantMode.ASYMMERTIC, test_dtype, scale, zero_point)
    inp_data = np.random.uniform(low=-512.0, high=512.0, size=(1, 32, 32, 32))

    # Forward pass: values and shapes must match the reference.
    inp = tensor(inp_data, dtype=np.float32)
    actual = fake_quant_tensor(inp, qparams).numpy()
    expected = fake_quant_tensor_gt(inp, scale, zero_point, qmin, qmax).numpy()
    assert np.allclose(actual, expected)
    assert actual.shape == expected.shape

    # Backward pass through the implementation under test.
    x_test = tensor(inp_data, dtype=np.float32)
    with Grad() as grad:
        grad.wrt(x_test, callback=_save_to(x_test))
        y_test = fake_quant_tensor(x_test, qparams)
        grad(y_test, tensor(F.ones_like(x_test)))

    # Backward pass through the reference.
    x_ref = tensor(inp_data, dtype=np.float32)
    with Grad() as grad:
        grad.wrt(x_ref, callback=_save_to(x_ref))
        y_ref = fake_quant_tensor_gt(x_ref, scale, zero_point, qmin, qmax)
        grad(y_ref, tensor(F.ones_like(x_ref)))

    assert np.allclose(x_test.grad.numpy(), x_ref.grad.numpy())
    assert make_shape_tuple(x_test.grad.shape) == make_shape_tuple(x_ref.grad.shape)

    # NaN inputs must stay NaN after fake quantization.
    nan_inp = F.full((1, 32, 3, 3), np.nan)
    nan_out = fake_quant_tensor(nan_inp, qparams).numpy()
    assert np.isnan(nan_out).all()
def test_adaptive_max_pool2d():
    """Check output and input gradient of ``F.adaptive_max_pool2d`` on a 4x4 ramp."""
    inp = tensor(np.arange(0, 16, dtype=np.float32).reshape(1, 1, 4, 4))
    oshp = (2, 2)
    grad = Grad().wrt(inp, callback=_save_to(inp))
    outp = F.adaptive_max_pool2d(inp, oshp)

    expected_shape = (inp.shape[0], inp.shape[1], *oshp)
    assert make_shape_tuple(outp.shape) == expected_shape
    expected_out = np.array([[[[5, 7], [13, 15]]]], dtype=np.float32)
    np.testing.assert_equal(outp.numpy(), expected_out)

    grad(outp, tensor(F.ones_like(outp)))
    assert make_shape_tuple(inp.grad.shape) == make_shape_tuple(inp.shape)

    # The expected gradient is 1 at the positions of the pooled values
    # (5, 7, 13, 15) and 0 elsewhere.
    expected_grad = np.array(
        [
            [
                [
                    [0.0, 0.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0, 1.0],
                    [0.0, 0.0, 0.0, 0.0],
                    [0.0, 1.0, 0.0, 1.0],
                ]
            ]
        ],
        dtype=np.float32,
    )
    np.testing.assert_equal(inp.grad.numpy(), expected_grad)
def test_ShuffleRNG():
    """Exercise ``random.shuffle`` under ``Grad`` and ``RNG.shuffle`` across seeds/devices."""
    collected = []

    def cb(grad):
        collected.append(grad)

    n, m = 6, 3
    arr = np.arange(n * m)

    # Shuffling while gradients are being recorded.
    out0 = Tensor(arr, dtype="float32")
    with Grad() as grad:
        grad.wrt(out0, callback=cb)
        random.shuffle(out0)
        grad(out0, F.ones_like(out0))

    rng_a = RNG(seed=111, device="xpu0")
    rng_b = RNG(seed=111, device="xpu1")
    rng_c = RNG(seed=222, device="xpu0")
    out1 = Tensor(arr, dtype="float32", device="xpu0")
    out2 = Tensor(arr, dtype="float32", device="xpu1")
    out3 = Tensor(arr, dtype="float32", device="xpu0")
    rng_a.shuffle(out1)
    rng_b.shuffle(out2)
    rng_c.shuffle(out3)

    # Same seed gives the same permutation; a different seed must differ.
    np.testing.assert_allclose(out1.numpy(), out2.numpy(), atol=1e-6)
    assert out1.device == "xpu0" and out2.device == "xpu1"
    assert not (out1.numpy() == out3.numpy()).all()

    # Shuffling a 2-D tensor keeps its shape.
    out = Tensor(arr, dtype="float32").reshape(n, m)
    rng_a.shuffle(out)
    out_shp = out.shape
    if not isinstance(out_shp, tuple):
        assert all(out.shape.numpy() == np.array([n, m]))
    else:
        assert out_shp == (n, m)
def test_roi_align():
    """Check the output shape and input-gradient shape of ``F.vision.roi_align``."""
    inp_feat, rois = _gen_roi_inp()
    output_shape = (7, 7)
    with Grad() as grad:
        grad.wrt(inp_feat, callback=_save_to(inp_feat))
        out_feat = F.vision.roi_align(
            inp_feat,
            rois,
            output_shape=output_shape,
            mode="average",
            spatial_scale=1.0 / 4,
            sample_points=2,
            aligned=True,
        )
        expected_shape = (rois.shape[0], inp_feat.shape[1], *output_shape)
        assert make_shape_tuple(out_feat.shape) == expected_shape
        grad(out_feat, tensor(F.ones_like(out_feat)))

    grad_shape = make_shape_tuple(inp_feat.grad.shape)
    assert grad_shape == make_shape_tuple(inp_feat.shape)
def test_Broadcast():
    """Gradient of ``F.broadcast_to`` from (3, 3, 1) to (3, 3, 10) must be all 10s."""
    data = np.random.rand(3, 3, 1).astype("float32")
    x = TensorWrapper(data)
    grad = Grad().wrt(x, callback=save_to(x))

    y = F.broadcast_to(x, (3, 3, 10))
    grad(y, F.ones_like(y))

    # Each input element feeds 10 output elements, so its gradient is 10.
    expected = np.ones((3, 3, 1), dtype=np.float32) * 10
    np.testing.assert_equal(expected, x.grad.numpy())
def test_reshape():
    """Gradient through ``reshape`` from (2, 5) to (5, 2) must be all ones."""
    data = np.random.rand(2, 5).astype("float32")
    x = TensorWrapper(data)
    grad = Grad().wrt(x, callback=save_to(x))

    y = x.reshape(5, 2)
    grad(y, F.ones_like(y))

    expected = np.ones((2, 5), dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def test_resize():
    """Gradient of ``F.resize`` from 32x32 to 16x16 must be 1/4 everywhere."""
    data = np.random.rand(3, 3, 32, 32).astype("float32")
    x = mge.Tensor(data)
    grad = Grad().wrt(x, callback=save_to(x))

    y = F.resize(x, (16, 16))
    grad(y, F.ones_like(y))

    expected = np.ones(data.shape, dtype=np.float32) / 4
    np.testing.assert_equal(expected, x.grad.numpy())
def test_Reduce_sum():
    """Gradient of ``sum(axis=0)`` on a 3x3 tensor must be all ones."""
    data = np.random.rand(3, 3).astype("float32")
    x = mge.Tensor(data)
    grad = Grad().wrt(x, callback=save_to(x))

    y = x.sum(axis=0)
    grad(y, F.ones_like(y))

    expected = np.ones((3, 3), dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def test_AxisAddRemove():
    """Gradient through ``expand_dims`` followed by ``squeeze`` must be all ones."""
    data = np.random.rand(1, 5).astype("float32")
    x = TensorWrapper(data)
    grad = Grad().wrt(x, callback=save_to(x))

    expanded = F.expand_dims(x, 2)
    y = F.squeeze(expanded, 0)
    grad(y, F.ones_like(y))

    expected = np.ones((1, 5), dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def test_interpolate_fastpath():
    """Gradient of bilinear ``interpolate`` from 32x32 to 16x16 must be 1/4 everywhere."""
    data = np.random.rand(3, 3, 32, 32).astype("float32")
    x = mge.Tensor(data)
    with Grad() as grad:
        grad.wrt(x, callback=save_to(x))
        y = F.vision.interpolate(x, size=(16, 16), mode="bilinear")
        grad(y, F.ones_like(y))

    expected = np.ones(data.shape, dtype=np.float32) / 4
    np.testing.assert_equal(expected, x.grad.numpy())
def test_IndexingMultiAxisVec():
    """Gradient of ``x[[0, 2], [0, 2]]`` must be 1 at (0, 0) and (2, 2), else 0."""
    data = np.random.rand(3, 3).astype("float32")
    x = TensorWrapper(data)
    grad = Grad().wrt(x, callback=save_to(x))

    y = x[[0, 2], [0, 2]]
    grad(y, F.ones_like(y))

    # Only the two picked elements receive gradient.
    expected = np.zeros((3, 3), dtype=np.float32)
    expected[[0, 2], [0, 2]] = 1
    np.testing.assert_equal(expected, x.grad.numpy())
def test_subtensor():
    """Gradient of the slice ``x[1:-1, :2]`` must be 1 inside the slice, else 0."""
    data = np.random.rand(3, 3).astype("float32")
    x = TensorWrapper(data)
    grad = Grad().wrt(x, callback=save_to(x))

    y = x[1:-1, :2]
    grad(y, F.ones_like(y))

    # Only the sliced region (row 1, columns 0-1) receives gradient.
    expected = np.zeros((3, 3), dtype=np.float32)
    expected[1:-1, :2] = 1
    np.testing.assert_equal(expected, x.grad.numpy())
def mesh_grid_mge(B, H, W):
    """Build a base grid of x indices, y indices and ones stacked on axis 1.

    Args:
        B: tile count used for the leading dimension.
        H: upper bound of the y range (``F.arange(0, H)``).
        W: upper bound of the x range (``F.arange(0, W)``).

    Returns:
        The stacked grid; the original code annotates its layout as B3HW.
    """
    xs = F.tile(F.arange(0, W), (B, H, 1))
    # y is tiled as (B, W, H) and then transposed so it lines up with x.
    ys = F.tile(F.arange(0, H), (B, W, 1)).transpose(0, 2, 1)
    return F.stack([xs, ys, F.ones_like(xs)], 1)
def test_dot():
    """Gradient of ``dot(u, matmul(x, v))`` with all-ones u and v must be all ones."""
    data = np.random.rand(2, 2).astype("float32")
    x = mge.Tensor(data)
    u = F.ones((2,))
    v = F.ones((2,))

    with Grad() as grad:
        grad.wrt(x, callback=save_to(x))
        y = F.dot(u, F.matmul(x, v))
        grad(y, F.ones_like(y))

    expected = np.ones((2, 2), dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def forward(
    self,
    input_ids,
    token_type_ids=None,
    attention_mask=None,
    output_all_encoded_layers=True,
):
    """Run embeddings, encoder and pooler on ``input_ids``.

    Args:
        input_ids: token id tensor fed to ``self.embeddings``.
        token_type_ids: optional; defaults to zeros shaped like ``input_ids``.
        attention_mask: optional; defaults to ones shaped like ``input_ids``.
        output_all_encoded_layers: when false, only the last encoder output
            is returned instead of the whole sequence of layer outputs.

    Returns:
        ``(encoded_layers, pooled_output)``.
    """
    if attention_mask is None:
        attention_mask = F.ones_like(input_ids)
    if token_type_ids is None:
        token_type_ids = F.zeros_like(input_ids)

    # Insert two broadcast axes at positions 1 and 2 so the mask can be added
    # to the raw attention scores.
    additive_mask = F.expand_dims(attention_mask, (1, 2))
    # Match the parameter dtype (fp16 compatibility).
    additive_mask = additive_mask.astype(next(self.parameters()).dtype)
    # Mask value 1 -> 0.0 (attend), mask value 0 -> -10000.0 (masked); added
    # before the softmax this effectively removes the masked positions.
    additive_mask = (1.0 - additive_mask) * -10000.0

    embedded = self.embeddings(input_ids, token_type_ids)
    encoded_layers = self.encoder(
        embedded,
        additive_mask,
        output_all_encoded_layers=output_all_encoded_layers,
    )
    pooled_output = self.pooler(encoded_layers[-1])
    if not output_all_encoded_layers:
        encoded_layers = encoded_layers[-1]
    return encoded_layers, pooled_output
def test_removeAxis():
    """Gradient through ``F.squeeze`` must be all ones.

    Also asserts that the intermediate tensor created inside the forward
    function is no longer alive once that function has returned.
    """
    data = np.random.rand(3, 3, 1, 1).astype("float32")
    x = mge.Tensor(data)

    with Grad() as grad:
        grad.wrt(x, callback=save_to(x))
        refs = {}

        def f(x):
            tmp = x * 1
            squeezed = F.squeeze(tmp, [2, 3])
            refs["x"] = TensorWeakRef(tmp)
            return squeezed

        y = f(x)
        # The weak reference must already be dead at this point.
        for ref in refs.values():
            assert ref() is None
        grad(y, F.ones_like(y))

    expected = np.ones((3, 3, 1, 1), dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def test_addAxis():
    """Gradient through ``F.expand_dims`` must be all ones.

    Also asserts that the intermediate tensor created inside the forward
    function is no longer alive once that function has returned.
    """
    data = np.random.rand(3, 3).astype("float32")
    x = mge.Tensor(data)
    grad = Grad().wrt(x, callback=save_to(x))
    refs = {}

    def f(x):
        tmp = x * 1
        expanded = F.expand_dims(tmp, [2, 3])
        refs["x"] = TensorWeakRef(tmp)
        return expanded

    y = f(x)
    # The weak reference must already be dead at this point.
    for ref in refs.values():
        assert ref() is None
    grad(y, F.ones_like(y))

    expected = np.ones((3, 3), dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def test_reshape():
    """Gradient through ``reshape`` must be all ones.

    Also asserts that the intermediate tensor created inside the forward
    function is no longer alive once that function has returned.
    """
    data = np.random.rand(2, 5).astype("float32")
    x = mge.Tensor(data)
    grad = Grad().wrt(x, callback=save_to(x))
    refs = {}

    def f(x):
        tmp = x * 1
        reshaped = tmp.reshape(5, 2)
        refs["x"] = TensorWeakRef(tmp)
        return reshaped

    y = f(x)
    # The weak reference must already be dead at this point.
    for ref in refs.values():
        assert ref() is None
    grad(y, F.ones_like(y))

    expected = np.ones((2, 5), dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def test_roi_pooling():
    """Check the output shape and input-gradient shape of ``F.vision.roi_pooling``."""
    inp_feat, rois = _gen_roi_inp()
    grad = Grad().wrt(inp_feat, callback=_save_to(inp_feat))
    output_shape = (7, 7)

    out_feat = F.vision.roi_pooling(
        inp_feat,
        rois,
        output_shape=output_shape,
        mode="max",
        scale=1.0 / 4,
    )
    expected_shape = (rois.shape[0], inp_feat.shape[1], *output_shape)
    assert make_shape_tuple(out_feat.shape) == expected_shape

    grad(out_feat, tensor(F.ones_like(out_feat)))
    grad_shape = make_shape_tuple(inp_feat.grad.shape)
    assert grad_shape == make_shape_tuple(inp_feat.shape)
def test_AxisAddRemove():
    """Gradient through ``expand_dims`` followed by ``squeeze`` must be all ones.

    Also asserts that the intermediate tensor created inside the forward
    function is no longer alive once that function has returned.
    """
    data = np.random.rand(1, 5).astype("float32")
    x = mge.Tensor(data)
    grad = Grad().wrt(x, callback=save_to(x))
    refs = {}

    def f(x):
        tmp = x * 1
        result = F.squeeze(F.expand_dims(tmp, 2), 0)
        refs["x"] = TensorWeakRef(tmp)
        return result

    y = f(x)
    # The weak reference must already be dead at this point.
    for ref in refs.values():
        assert ref() is None
    grad(y, F.ones_like(y))

    expected = np.array([[1, 1, 1, 1, 1]], dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def test_subtensor():
    """Gradient of the slice ``[1:-1, :2]`` must be 1 inside the slice, else 0.

    Also asserts that the intermediate tensor created inside the forward
    function is no longer alive once that function has returned.
    """
    data = np.random.rand(3, 3).astype("float32")
    x = mge.Tensor(data)

    with Grad() as grad:
        grad.wrt(x, callback=save_to(x))
        refs = {}

        def f(x):
            tmp = x * 1
            sliced = tmp[1:-1, :2]
            refs["x"] = TensorWeakRef(tmp)
            return sliced

        y = f(x)
        # The weak reference must already be dead at this point.
        for ref in refs.values():
            assert ref() is None
        grad(y, F.ones_like(y))

    expected = np.array([[0, 0, 0], [1, 1, 0], [0, 0, 0]], dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def test_IndexingMultiAxisVec():
    """Gradient of ``[[0, 2], [0, 2]]`` indexing must be 1 at (0, 0) and (2, 2).

    Also asserts that the intermediate tensor created inside the forward
    function is no longer alive once that function has returned.
    """
    data = np.random.rand(3, 3).astype("float32")
    x = mge.Tensor(data)
    grad = Grad().wrt(x, callback=save_to(x))
    refs = {}

    def f(x):
        tmp = x * 1
        picked = tmp[[0, 2], [0, 2]]
        refs["x"] = TensorWeakRef(tmp)
        return picked

    y = f(x)
    # The weak reference must already be dead at this point.
    for ref in refs.values():
        assert ref() is None
    grad(y, F.ones_like(y))

    expected = np.array([[1, 0, 0], [0, 0, 0], [0, 0, 1]], dtype=np.float32)
    np.testing.assert_equal(expected, x.grad.numpy())
def forward(self, features, label=None, mask=None):
    """Compute the supervised contrastive loss.

    If ``label`` and ``mask`` are both None, the loss degenerates to the
    SimCLR unsupervised loss.

    Reference:
        "A Simple Framework for Contrastive Learning of Visual Representations"
        <https://arxiv.org/pdf/2002.05709.pdf>
        "Supervised Contrastive Learning" <https://arxiv.org/abs/2004.11362>

    Args:
        features(tensor): The embedding feature. shape=[bs, n_views, ...]
        label(tensor): The label of images, shape=[bs]
        mask(tensor): contrastive mask of shape [bsz, bsz], mask_{i,j}=1
            if sample j has the same class as sample i. Can be asymmetric.

    Returns:
        The mean loss over all anchors.

    Raises:
        ValueError: if ``features`` has fewer than 3 dimensions, if both
            ``label`` and ``mask`` are given, or if ``self.contrast_mode``
            is neither "one" nor "all".
        RuntimeError: if the number of labels differs from the batch size.
    """
    if len(features.shape) < 3:
        raise ValueError("Features need have 3 dimensions at least")
    bs, num_view = features.shape[:2]
    # Flatten any trailing dimensions so features is [bs, num_view, dim].
    if len(features.shape) > 3:
        features = features.reshape(bs, num_view, -1)

    # label and mask are mutually exclusive ways to describe positives.
    if (label is not None) and (mask is not None):
        raise ValueError("label and mask cannot provided at the same time")
    elif (label is None) and (mask is None):
        mask = F.eye(bs, dtype="float32")
    elif label is not None:
        label = label.reshape(-1, 1)
        if label.shape[0] != bs:
            raise RuntimeError(
                "Num of labels does not match num of features")
        # Cast the comparison result so every branch yields a float32 mask
        # that can be used in the arithmetic below.
        mask = F.equal(label, label.T).astype("float32")
    else:
        mask = mask.astype("float32")

    contrast_count = features.shape[1]
    # Keep ``features`` as a tensor; the per-view pieces get their own name
    # so mode "one" can still slice the first view from the tensor.
    views = F.split(features, contrast_count, axis=1)
    contrast_feature = F.squeeze(F.concat(views, axis=0), axis=1)
    if self.contrast_mode == "one":
        anchor_feature = features[:, 0]
        anchor_count = 1
    elif self.contrast_mode == "all":
        anchor_feature = contrast_feature
        anchor_count = contrast_count
    else:
        raise ValueError("Unknown mode:{}".format(self.contrast_mode))

    # compute logits
    anchor_dot_contrast = F.div(
        F.matmul(anchor_feature, contrast_feature.T), self.temperate)
    # for numerical stability
    logits_max = F.max(anchor_dot_contrast, axis=-1, keepdims=True)
    logits = anchor_dot_contrast - logits_max

    # Tile the [bs, bs] mask block-wise to
    # [bs * anchor_count, bs * contrast_count] so that entry (i, j) refers to
    # samples (i % bs, j % bs), matching the view-major order of
    # ``contrast_feature``.
    mask = F.tile(mask, (anchor_count, contrast_count))
    # mask-out self-contrast cases: zero the diagonal entry of every anchor.
    num_anchors = int(bs * anchor_count)
    logits_mask = F.scatter(
        F.ones_like(mask), 1,
        F.arange(0, num_anchors, dtype="int32").reshape(-1, 1),
        F.zeros(num_anchors, dtype="int32").reshape(-1, 1))
    mask = mask * logits_mask

    # compute log_prob (equation 2 of the supervised contrastive paper)
    exp_logits = F.exp(logits) * logits_mask
    log_prob = logits - F.log(F.sum(exp_logits, axis=1, keepdims=True))

    # Mean log-likelihood over the positives of each anchor.
    # NOTE: an anchor without any positive gives a 0 / 0 here.
    mean_log_prob_pos = F.sum(mask * log_prob, axis=1) / F.sum(mask, axis=1)

    # loss
    loss = -(self.temperate / self.base_temperate) * mean_log_prob_pos
    loss = F.mean(loss.reshape(anchor_count, bs))
    return loss
def f(x0, x1):
    """Multiply ``x0`` by ``x1`` under ``gm`` and backpropagate ones.

    Returns:
        The product and ``x0.grad``.
    """
    with gm:
        product = x0 * x1
        gm.backward(product, F.ones_like(product))
        grad_x0 = x0.grad
    return product, grad_x0
def test_correlation():
    """Exercise ``F.vision.correlation``: gradient shape plus fixed-input checks."""
    # Settings shared by the 3x3 single-window cases below.
    single_window = dict(
        kernel_size=3, max_displacement=0, stride1=1, stride2=1, pad_size=0,
    )

    # test case 0: check the grad shape
    data1, data2 = _gen_correlation()
    grad = Grad().wrt(data1, callback=_save_to(data1))
    out_feat = F.vision.correlation(
        data1,
        data2,
        kernel_size=5,
        max_displacement=4,
        stride1=2,
        stride2=2,
        pad_size=2,
        is_multiply=True,
    )
    grad(out_feat, tensor(F.ones_like(out_feat)))
    assert make_shape_tuple(data1.grad.shape) == make_shape_tuple(data1.shape)

    # test case 1: from https://github.com/NVIDIA/flownet2-pytorch/issues/194
    data1, data2 = _gen_correlation(random=False, image_shape=(1, 1, 3, 3))
    out_feat = F.vision.correlation(
        data1, data2, is_multiply=True, **single_window
    )
    assert abs(out_feat.sum() - 1) < 1e-9

    # test cases 2 and 3: same-image subtraction (the check is run twice)
    for _ in range(2):
        data1, data2 = _gen_correlation(random=False, image_shape=(1, 1, 3, 3))
        out_feat = F.vision.correlation(
            data1, data2, is_multiply=False, **single_window
        )
        assert out_feat.sum() < 1e-9

    # test case 4: check correlation of two constant images
    data1, _ = _gen_correlation(
        random=False, image_shape=(1, 1, 220, 220), constant=2.0
    )
    _, data2 = _gen_correlation(
        random=False, image_shape=(1, 1, 220, 220), constant=1.0
    )
    out_feat = F.vision.correlation(
        data1,
        data2,
        kernel_size=3,
        max_displacement=2,
        stride1=1,
        stride2=2,
        pad_size=0,
        is_multiply=False,
    )
    assert abs(out_feat.mean() - 1) < 1e-9
def forward(self, output, target, epoch=0):
    """Combine photometric and smoothness losses over a flow pyramid.

    Args:
        output: mapping with "flow_fw" and "flow_bw" sequences of flows.
        target: tensor whose first 3 channels are image 1 and the remaining
            channels image 2.
        epoch: current epoch; before epoch 250 (and when not fine-tuning)
            the occlusion masks are replaced by all-ones masks.

    Returns:
        Sum of the photometric losses of all levels plus 50 times the
        smoothness loss of level 0.
    """
    flows_fw, flows_bw = output["flow_fw"], output["flow_bw"]
    # Each pyramid level holds forward flow in channels [:2], backward in [2:].
    flow_pyrs = [F.concat([fw, bw], 1) for fw, bw in zip(flows_fw, flows_bw)]
    img1, img2 = target[:, :3], target[:, 3:]

    self.pyramid_occu_mask1 = []
    self.pyramid_occu_mask2 = []
    occu_mask1 = 1 - get_occu_mask_bidirection(
        flow_pyrs[0][:, :2], flow_pyrs[0][:, 2:])
    occu_mask2 = 1 - get_occu_mask_bidirection(
        flow_pyrs[0][:, 2:], flow_pyrs[0][:, :2])

    photo_losses = []
    smooth_losses = []
    for level, flow in enumerate(flow_pyrs):
        _, _, h, w = flow.shape
        if level == 0:
            # The level-0 resolution normalizes the flow at every level.
            s = min(h, w)
        if level == 4:
            # Level 4 contributes nothing to either loss.
            photo_losses.append(0)
            smooth_losses.append(0)
            continue

        img1_rsz = F.vision.interpolate(img1, (h, w))
        img2_rsz = F.vision.interpolate(img2, (h, w))
        img1_warp = flow_warp(img2_rsz, flow[:, :2])
        img2_warp = flow_warp(img1_rsz, flow[:, 2:])

        if level != 0:
            occu_mask1 = F.vision.interpolate(occu_mask1, (h, w))
            occu_mask2 = F.vision.interpolate(occu_mask2, (h, w))
        self.pyramid_occu_mask1.append(occu_mask1)
        self.pyramid_occu_mask2.append(occu_mask2)

        if epoch < 250 and not self.params.fine_tune:
            occu_mask1 = occu_mask2 = F.ones_like(occu_mask2)

        # forward warping
        photo_loss = self.photo_loss(img1_rsz, img1_warp, occu_mask1)
        smooth_loss = self.smooth_loss(flow[:, :2] / s, img1_rsz)
        # backward warping
        photo_loss += self.photo_loss(img2_rsz, img2_warp, occu_mask2)
        smooth_loss += self.smooth_loss(flow[:, 2:] / s, img2_rsz)
        photo_loss /= 2
        smooth_loss /= 2

        photo_losses.append(photo_loss)
        smooth_losses.append(smooth_loss)
        del photo_loss
        del smooth_loss

    total_photo = sum(photo_losses)
    weighted_smooth = 50 * smooth_losses[0]
    return total_photo + weighted_smooth
def photo_loss_ssim(im_x, im_y, occ_mask=None):
    """Return the weight-normalized SSIM loss from ``_weighted_ssim``.

    Args:
        im_x: first image tensor.
        im_y: second image tensor.
        occ_mask: optional mask; defaults to ones shaped like ``im_x``.

    Returns:
        ``sum(loss_diff * occ_weight) / (sum(occ_weight) + 1e-6)``.
    """
    mask = F.ones_like(im_x) if occ_mask is None else occ_mask
    loss_diff, occ_weight = _weighted_ssim(im_x, im_y, mask)
    weighted_total = F.sum(loss_diff * occ_weight)
    # The small epsilon guards against an all-zero weight map.
    normalizer = F.sum(occ_weight) + 1e-6
    return weighted_total / normalizer