def test_backward_dense(self, batch_size, pooling_factor, pooling_factor_std, tt_ndims):
    device = torch.device("cuda:0")
    torch.cuda.set_device(device)
    tt_p_shapes = [7, 9, 11, 5]
    tt_q_shapes = [3, 4, 5, 7]
    tt_ranks = [13, 12, 7]
    tt_p_shapes = tt_p_shapes[:tt_ndims]
    tt_q_shapes = tt_q_shapes[:tt_ndims]
    tt_ranks = tt_ranks[: (tt_ndims - 1)]
    num_embeddings = np.prod(np.array(tt_p_shapes))
    embedding_dim = np.prod(np.array(tt_q_shapes))
    _, indices, offsets, _ = generate_sparse_feature(
        batch_size,
        num_embeddings=num_embeddings,
        pooling_factor=float(pooling_factor),
        pooling_factor_std=float(pooling_factor_std),
        generate_scores=False,
        unary=False,
        unique=False,
    )
    # create TT-Embedding op
    offsets = torch.tensor(offsets, dtype=torch.int64, device=device)
    indices = torch.tensor(indices, dtype=torch.int64, device=device)
    tt_emb = TTEmbeddingBag(
        num_embeddings=num_embeddings,
        embedding_dim=embedding_dim,
        tt_p_shapes=tt_p_shapes,
        tt_q_shapes=tt_q_shapes,
        tt_ranks=tt_ranks,
        sparse=False,
        weight_dist="uniform",
    )
    tt_emb.to(device)
    emb = torch.nn.EmbeddingBag(
        num_embeddings,
        embedding_dim,
        sparse=True,
        mode="sum",
        _weight=tt_emb.full_weight(),
        include_last_offset=True,
    )
    emb.to(device)
    d_output = torch.rand(batch_size, embedding_dim, device=device) * 0.1
    tt_cores = [tt.clone().detach().requires_grad_(True) for tt in tt_emb.tt_cores]
    full_weight = tt_matrix_to_full(
        tt_p_shapes, tt_q_shapes, tt_ranks, tt_cores, [1, 0, 2, 3]
    )
    # tt_emb
    output = tt_emb(indices, offsets)
    output.backward(d_output)
    # reference
    output_ref = emb(indices.long(), offsets.long())
    output_ref.backward(d_output)
    d_weight_ref = emb.weight.grad.to_dense()
    full_weight.backward(d_weight_ref)
    for i in range(tt_ndims):
        torch.testing.assert_allclose(tt_emb.tt_cores[i].grad, tt_cores[i].grad)
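
# The dense test above checks TT-core gradients via the chain rule: the gradient
# w.r.t. the materialized table (emb.weight.grad) is pushed back through the dense
# reconstruction (tt_matrix_to_full) to obtain reference per-core gradients.
# A minimal sketch of that pattern using a simple two-factor weight W = A @ B,
# purely illustrative and not used by the tests in this file:
def _chain_rule_gradcheck_sketch():
    A = torch.randn(8, 4, requires_grad=True)
    B = torch.randn(4, 16, requires_grad=True)
    W = A @ B                  # dense reconstruction (stand-in for tt_matrix_to_full)
    d_W = torch.rand_like(W)   # stand-in for emb.weight.grad.to_dense()
    W.backward(d_W)            # A.grad / B.grad now hold the reference factor gradients
    return A.grad, B.grad
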
def test_backward_adagrad(self, batch_size, pooling_factor, pooling_factor_std, tt_ndims):
    device = torch.device("cuda:0")
    torch.cuda.set_device(device)
    tt_p_shapes = [7, 9, 11, 5]
    tt_q_shapes = [3, 4, 5, 7]
    tt_ranks = [13, 12, 7]
    tt_p_shapes = tt_p_shapes[:tt_ndims]
    tt_q_shapes = tt_q_shapes[:tt_ndims]
    tt_ranks = tt_ranks[: (tt_ndims - 1)]
    num_embeddings = np.prod(np.array(tt_p_shapes))
    embedding_dim = np.prod(np.array(tt_q_shapes))
    learning_rate = 0.1
    eps = 0.0001
    _, indices, offsets, _ = generate_sparse_feature(
        batch_size,
        num_embeddings=num_embeddings,
        pooling_factor=float(pooling_factor),
        pooling_factor_std=float(pooling_factor_std),
        generate_scores=False,
        unary=False,
        unique=False,
    )
    # create TT-Embedding op
    offsets = torch.tensor(offsets, dtype=torch.int64, device=device)
    indices = torch.tensor(indices, dtype=torch.int64, device=device)
    tt_emb = TTEmbeddingBag(
        num_embeddings=num_embeddings,
        embedding_dim=embedding_dim,
        tt_p_shapes=tt_p_shapes,
        tt_q_shapes=tt_q_shapes,
        tt_ranks=tt_ranks,
        sparse=True,
        optimizer=OptimType.EXACT_ADAGRAD,
        learning_rate=learning_rate,
        eps=eps,
    )
    tt_emb.to(device)
    emb = torch.nn.EmbeddingBag(
        num_embeddings,
        embedding_dim,
        sparse=True,
        mode="sum",
        _weight=tt_emb.full_weight(),
        include_last_offset=True,
    )
    emb.to(device)
    d_output = torch.rand(batch_size, embedding_dim, device=device) * 0.1
    tt_cores = [tt.clone().detach().requires_grad_(True) for tt in tt_emb.tt_cores]
    full_weight = tt_matrix_to_full(
        tt_p_shapes, tt_q_shapes, tt_ranks, tt_cores, [1, 0, 2, 3]
    )
    # tt_emb
    output = tt_emb(indices, offsets)
    output.backward(d_output)
    # reference
    output_ref = emb(indices.long(), offsets.long())
    output_ref.backward(d_output)
    d_weight_ref = emb.weight.grad.to_dense()
    full_weight.backward(d_weight_ref)
    # expected Adagrad state and cores after one step (state starts at zero)
    new_optimizer_state = [torch.mul(t.grad, t.grad) for t in tt_cores]
    new_tt_cores = [
        t - torch.div(t.grad * learning_rate, torch.sqrt(new_optimizer_state[i]) + eps)
        for i, t in enumerate(tt_cores)
    ]
    for i in range(tt_ndims):
        torch.testing.assert_allclose(tt_emb.optimizer_state[i], new_optimizer_state[i])
        torch.testing.assert_allclose(tt_emb.tt_cores[i], new_tt_cores[i])
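
# The Adagrad test recomputes the expected update by hand; because this is the first
# optimizer step, the accumulated state equals grad * grad. A hypothetical helper
# (not part of the TTEmbeddingBag API) sketching the same exact-Adagrad rule for a
# single tensor:
def _adagrad_step_sketch(weight, grad, state, learning_rate=0.1, eps=1.0e-4):
    # state accumulates squared gradients; weight moves against the rescaled gradient
    new_state = state + grad * grad
    new_weight = weight - learning_rate * grad / (torch.sqrt(new_state) + eps)
    return new_weight, new_state
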