def __sparse_loss(sparsity_probability, covariates, model_children, device):
        """Return the KL-divergence sparsity term for the encoder activations.

        The encoder is taken to be ``model_children[0]``; how the covariates
        are propagated depends on how many direct children it has:

        * 2 children -- the covariates go through the whole encoder module.
        * 4 children -- the covariates go through the first three children
          in order; the fourth child is not applied.
        * any other count -- the covariates are used as-is, with no forward
          pass (NOTE(review): confirm this fall-through is intended).

        Args:
            sparsity_probability: forwarded unchanged to
                ``Utils.KL_divergence`` as its first argument.
            covariates: input fed to the encoder (presumably a tensor --
                TODO confirm against callers).
            model_children: indexable collection whose first element is the
                encoder module; it must be callable and expose ``children()``.
            device: forwarded unchanged to ``Utils.KL_divergence``.

        Returns:
            ``0 + Utils.KL_divergence(sparsity_probability, activations, device)``.
        """
        encoder_module = model_children[0]
        sub_layers = list(encoder_module.children())
        depth = len(sub_layers)

        activations = covariates
        if depth == 2:
            # Shallow encoder: run the complete module in one call.
            activations = encoder_module(activations)
        elif depth == 4:
            # Deeper encoder: apply children 0, 1 and 2 one after another
            # and stop before the fourth child.
            for layer in sub_layers[:3]:
                activations = layer(activations)

        # Adding to an integer zero mirrors the original accumulator pattern,
        # so the returned object is produced exactly as before.
        return 0 + Utils.KL_divergence(sparsity_probability, activations, device)