Example #1
 def set_embedding(self, name: str, vector: torch.tensor):
     device = flair.device
     if len(self._embeddings.keys()) > 0:
         device = next(iter(self._embeddings.values())).device
     self._embeddings[name] = vector.to(device, non_blocking=True)
Example #2
    def marginal_variational_qf_parameters(self, X: torch.tensor, diagonal: bool, is_duvenaud: bool, init_Z: torch.tensor = None) -> torch.tensor:
        """ Marginal variational posterior q(f) = \int p(f|u) q(u) du
            q(f) = \int p(f|u) q(u) du = N(f | K_xz K_zz_inv (m - \mu_z) + m_x,
                                           K_xx - K_xz K_zz_inv K_zx + [K_xz K_zz_inv] S [K_xz K_zz_inv]^T)
                Args:
                        `X`           (torch.tensor)  :->:  input locations where the marginal distribution q(f) is computed. Can have shape (S*MB,Dx) or (Dy,S*MB,Dx)
                        `diagonal`    (bool)          :->:  If true, return only the diagonal of the covariance
                        `is_duvenaud` (bool)          :->:  Indicates whether the Duvenaud mean function is used. Only relevant in DGPs
                        `init_Z`      (torch.tensor)  :->:  Only used if is_duvenaud = True. It is used to concatenate the input inducing points to
                                                            the inducing points at each layer

                Returns:
                        `mu_q_f`      (torch.tensor)  :->:  shape (Dy,MB,1)
                        `cov_q_f`     (torch.tensor)  :->:  shape (Dy,MB,1) if diagonal else (Dy,MB,MB)
        """
        ## ================================= ##
        ## ===== Pre-Compute Variables ===== ##
        if len(X.shape) == 2:
            X = X.repeat(self.out_dim,1,1) # repeat here; otherwise this operation would be done twice (by marginal_qf_parameters and by the likelihood) to work batched and multi-output respectively
        assert len(X.shape) == 3, 'Invalid input X.shape' 

        Dy,MB,M  = self.out_dim,X.size(1),self.M
        Z        = self.Z

        kernel   = self.covariance_function
        mean     = self.mean_function

        if self.Z_is_shared:
            # In this case this repeat is not particularly needed because the kernel will repeat Z
            # when doing forward, both if batch_shape is out_dim or is 1 (self.kernel_is_shared True).
            # Keep it explicit here for better understanding of the code.
            Z = Z.repeat(self.out_dim,1,1) 

        # Concatenate inducing points if is duvenaud
        if is_duvenaud:
            #z_concat = X[0,0:self.M,-1].view(self.M,1)
            init_Z    = init_Z.view(1,self.M,-1).repeat(self.out_dim,1,1)
            Z = torch.cat((Z,init_Z),2)

        K_xx = kernel(X,are_equal = True, diag = diagonal)
        mu_x = gpy.lazy.delazify( mean(X) ).view(Dy, MB, 1)

        K_zz = kernel(Z,are_equal = False).evaluate()
        mu_z = gpy.lazy.delazify( mean(Z) ).view(Dy, M , 1)

        K_xz = kernel(X,Z,are_equal = False).evaluate()

        # stabilize K_xz. In case Z = X we should add jitter if psd_safe_cholesky adds jitter to K_zz
        # jitter can only be added to square matrices

        K_zx = torch.transpose(K_xz,1,2) # pre-compute the transpose as it is required several times

        # cholesky from K_zz
        L_zz, K_zz  = psd_safe_cholesky(K_zz, upper = False, jitter = cg.global_jitter) # The K_zz returned is that with noise

        if self.is_whiten:
            L_zz_t = L_zz.transpose(1,2) 

        # variational distribution
        q_U    = self.q_U
        m_q_U  = q_U.variational_mean
        K_q_U  = q_U.chol_variational_covar
        
        lower_mask = torch.ones(K_q_U.shape[-2:], dtype=cg.dtype, device=cg.device).tril(0)
        L_q_U = K_q_U.mul(lower_mask)
        K_q_U = torch.matmul( L_q_U,L_q_U.transpose(1,2) )
        m_q_U  = m_q_U.view(Dy,M,-1)

        ## =================== ##
        ## ==== mean q(f) ==== ##

        if self.is_whiten:
            # mu_qf = K_{xz}[L_{zz}^T]^{-1}m_0+\mu_x
            sol,_ = torch.triangular_solve(m_q_U, L_zz_t, upper = True)
            mu_q_f = torch.matmul(K_xz,sol) + mu_x

        else:
            # mu_qf = K_xz K_zz_inv( m - mu_Z) + m_x
            lhs = torch.cholesky_solve(m_q_U-mu_z, L_zz, upper = False)
            mu_q_f = torch.matmul(K_xz,lhs) + mu_x

        
        ## ========================= ##
        ## ==== covariance q(f) ==== ##
        ## Note:
            # To compute the diagonal of q(f) we use the following identity. Here @ denotes matrix product and .* element-wise product.
            # For K_xz @ K_zz_inv @ K_zx the diagonal is:
            #   sum(K_zx .* [K_zz_inv @ K_zx], 0)
            # i.e. the identity is:
            #   diag(A @ B @ A^T) = sum(A^T .* [B @ A^T], 0)
            # For the covariance term note that [K_xz K_zz_inv] S [K_xz K_zz_inv]^T = [K_zz_inv K_zx]^T S [K_zz_inv K_zx],
            # where the output of the linear solver is sol = K_zz_inv K_zx. So we have sol^T S sol, and sum(sol .* [S @ sol], 0) gives its diagonal.
            # Note that, as the operations are batched, we reduce dimension 1 instead of dimension 0, and use matmul to perform the batched operation.

        # sol = K_zz^{-1}@K_zx
        sol = torch.cholesky_solve(K_zx, L_zz, upper = False)

        if self.is_whiten:
            # cov_qf = K_{xx} -K_{xz} K_{zz}^{-1} K_{zx} + K_{xz} {L_{zz}^T}^{-1} S L_{zz}^{-1}K_{zx} 
            rhs,_ = torch.triangular_solve(K_zx, L_zz, upper = False)
            if diagonal:
                cov_q_f = K_xx - torch.sum(torch.mul(K_zx,sol),1) + torch.sum(torch.mul(rhs,torch.matmul(K_q_U,rhs)),1)
            else:
                cov_q_f = K_xx - torch.matmul(K_xz,sol) + torch.matmul(torch.matmul(torch.transpose(rhs,1,2),K_q_U),rhs)

        else:
            # cov_qf = K_{xx} -K_{xz} K_{zz}^{-1} K_{zx} + [K_{xz} K_{zz}^{-1}] S [K_{xz} K_{zz}^{-1}]^T 
            if diagonal:
                cov_q_f = K_xx - torch.sum(torch.mul(K_zx,sol),1) + torch.sum(torch.mul(sol,torch.matmul(K_q_U,sol)),1)
            else:
                cov_q_f = K_xx - torch.matmul(K_xz,sol) + torch.matmul(torch.matmul(torch.transpose(sol,1,2),K_q_U),sol)

        if diagonal:
            cov_q_f = torch.unsqueeze(cov_q_f,2)

        return mu_q_f, cov_q_f
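
The covariance computation above relies on the identity diag(A @ B @ A^T) = sum(A^T .* (B @ A^T), 0). A minimal standalone check of that identity (a hypothetical sketch, not part of the original class):

import torch

# Hypothetical sketch: check diag(A @ B @ A.T) == sum(A.T * (B @ A.T), dim=0)
# for a symmetric positive-definite B, as used for the diagonal of cov_q_f.
A = torch.randn(5, 3)                       # plays the role of K_xz
L = torch.randn(3, 3)
B = L @ L.t() + 3.0 * torch.eye(3)          # plays the role of K_zz_inv or S

full_diag = torch.diagonal(A @ B @ A.t())   # explicit (and wasteful) computation
fast_diag = torch.sum(A.t() * (B @ A.t()), dim=0)
assert torch.allclose(full_diag, fast_diag, atol=1e-5)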
Example #3
    def test_log_likelihood(self, X: torch.tensor, Y: torch.tensor, return_moments: bool, Y_std: float, S_MC_NNet: int = None) -> torch.tensor:
        """ Computes the predictive log likelihood
                \log p(Y*|X*) = \log \int p(y*|G(f*),C_y) q(f*,f|u) q(u) df*,df,du
                   -> We take the diagonal of C_Y as samples are assumed to be i.i.d.
                   -> The integral can be approximated either with Monte Carlo or with quadrature. This function uses quadrature.

                Args:
                        `X`                 (torch.tensor) :->: Input locations. Shape (MB,Dx) or shape (Dy,MB,Dx)
                        `Y`                 (torch.tensor) :->: Ground truth labels. Shape (MB,Dy)
                        `return_moments`    (bool)         :->: If true, also return moments 1 and 2 of the predictive distribution.
                        `Y_std`             (float)        :->: Standard deviation of the regressed variable. Used to re-scale the output.
                        `S_MC_NNet`         (int)          :->: Number of samples from the dropout distribution if fully_bayesian is true.

                Returns:
                        `log_p_y`           (torch.tensor) :->: Log probability of each of the outputs, as a tensor of shape (Dy,)
                        `predictive_params` (list)         :->: If return_moments is True, a list with the mean and variance of the predictive distribution. This is done in this function
                                                                because for some test log likelihoods we need to compute the predictive anyway; hence support is given for any likelihood. Moments have shape
                                                                (Dy,MB,1)
        """
        MB = X.size(0)
        Dx = X.size(1)
        Dy = self.out_dim
        
        X_run  = X  # the rest of the function uses X_run, so alias X here
        if len(X_run.shape) == 2:
            X_run = X_run.repeat(self.out_dim,1,1) 
        assert len(X_run.shape) == 3, 'Invalid input X.shape'

        self.eval() # set parameters for eval mode. Batch normalization, dropout etc
        if self.fully_bayesian:
            # activate dropout if required
            is_dropout = enable_eval_dropout(self.modules())
            assert is_dropout, "You set the model to fully Bayesian mode but there are no dropout layers in your model. This is asserted because otherwise the code would silently run in non-Bayesian mode"
            assert S_MC_NNet is not None, "The parameter S_MC_NNet was left at its default of None, which is invalid when self.fully_bayesian is True"

        with torch.no_grad():

            ## ================================================ ##
            ## =========== GAUSSIAN LIKELIHOOD ================ ##
            ## == with non linear mean
            if isinstance(self.likelihood,GaussianNonLinearMean):
                # retrieve the noise and expand
                log_var_noise = self.likelihood.log_var_noise
                if self.likelihood.noise_is_shared:
                    log_var_noise = self.likelihood.log_var_noise.expand(Dy,1)

                ## ================================================== ##
                ## === Compute moments of predictive distribution === ##
                #  In this model this is not necessary for computing the log likelihood.
                #  However, we give the option of returning these parameters to be consistent
                #  with the standard GP.
                predictive_params = None
                if return_moments:
                    m1,m2, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal = True, S_MC_NNet = S_MC_NNet)
                    predictive_params = [m1,m2]
                else:
                    mean_q_f, cov_q_f = self.marginal_variational_qf_parameters(X_run, diagonal = True, is_duvenaud = False, init_Z = None)
                mean_q_f, cov_q_f = mean_q_f.squeeze(dim = -1),cov_q_f.squeeze(dim = -1)

                self.eval()
                if self.fully_bayesian:
                    ## Call self.eval() again, as self.predictive_distribution calls self.train() before returning
                    is_dropout = enable_eval_dropout(self.modules())
                    assert is_dropout, "You set the model to fully Bayesian mode but there are no dropout layers in your model. This is asserted because otherwise the code would silently run in non-Bayesian mode"

                ## Common functions used by bayesian and non bayesian flows
                def get_quad_weights_shifted_locations(mean_q_f,cov_q_f):
                    ## Get the quadrature points and the weights
                    locations = self.likelihood.quadrature_distribution.locations
                    locations = _pad_with_singletons(locations, num_singletons_before=0, num_singletons_after = mean_q_f.dim())
                    shifted_locs = torch.sqrt(2.0 * cov_q_f) * locations + mean_q_f # Shape (S_quad,Dy,S,MB)

                    weights = self.likelihood.quadrature_distribution.weights
                    weights = _pad_with_singletons(weights, num_singletons_before=0, num_singletons_after = shifted_locs.dim() - 1) # Shape (S_quad,1,1,1)

                    return shifted_locs, weights

                def compute_log_lik(Y,Y_std,shifted_locs,C_Y):
                    ## Re-scale by Y_std, as is commonly done for comparison on UCI benchmarks
                    Y   = Y_std*Y
                    m_Y = Y_std*shifted_locs
                    C_Y = (Y_std*torch.sqrt(C_Y))**2

                    log_p_y = batched_log_Gaussian( Y, m_Y, C_Y, diagonal = True, cov_is_inverse = False) # (S_quad,Dy,S_MC,MB)
                    
                    return log_p_y

                S_MC_NNet = 1 if not self.fully_bayesian else S_MC_NNet # Note that the estimator is the same for input dependent and Bayesian. Just need to expand or not this dimension
                                                                        
                S_quad = self.quad_points 
                G_mat  = self.G_matrix

                # noise retrieve and reshape
                C_Y = torch.exp(log_var_noise).expand(-1,MB).view(Dy,1,MB,1).repeat((S_quad,1,S_MC_NNet,1,1)) # (Squad,Dy,S_MC_NNet,MB,1). Add extra dimension 1 so that we can compute 
                                                                                                                  #                           likelihood using batched_log_gaussian function    
                # observation reshape
                Y = Y.t().view(1,Dy,1,MB,1).repeat((S_quad,1,S_MC_NNet,1,1))   # S,Dy,S_MC_NNet,MB,1

                # Y_std reshape
                Y_std = Y_std.view(1,Dy,1,1,1).repeat(S_quad,1,S_MC_NNet,MB,1) # S,Dy,S_MC_NNet,MB,1

                # this operation could be done by repeating X and computing mean_q_f as in the DGP, but there is no need for extra computation here as X is constant: just repeat.
                mean_q_f, cov_q_f = mean_q_f.unsqueeze(dim = 1),cov_q_f.unsqueeze(dim = 1) # Remove last dimension, so that we can warp. We add it later for batched_log_lik
                mean_q_f = mean_q_f.repeat(1,S_MC_NNet,1) # (Dy,S_MC_NNet,MB)
                cov_q_f  = cov_q_f.repeat(1,S_MC_NNet,1)

                ## =================================== ##
                ## === Compute test log likelihood === ##
                shifted_locs, weights =  get_quad_weights_shifted_locations(mean_q_f,cov_q_f)

                ## Warp quadrature points
                # expand X to perform MC dropout over NNets parameters
                X_run = X_run.unsqueeze(dim = 1).repeat(1,S_MC_NNet,1,1) # Just add one extra dimension. No need to repeat over S_quad as pytorch broadcasts automatically.
                                                                         # It is important to repeat over S_MC_NNet: this way each forward through X computes a different
                                                                         # MC sample of the flow parameters. Otherwise pytorch would broadcast S_MC_NNet as well, and we would
                                                                         # only be using one sample from the posterior over W.
                for idx,fl in enumerate(G_mat):
                     shifted_locs[:,idx,:,:] = fl(shifted_locs[:,idx,:,:],X_run[idx]) # (S_quad,Dy,S_MC_NNet,MB)

                shifted_locs = shifted_locs.view(S_quad,Dy,S_MC_NNet,MB,1) # shape (S_quad,Dy,S,MB,1)

                log_p_y = compute_log_lik(Y,Y_std,shifted_locs,C_Y)

                if self.fully_bayesian: # the only difference between bayesian and the rest is here, where we perform a double integration for this case

                    # Reduce with a double logsumexp operation. Check the estimator here: @TODO: add link once we release the github repository
                    reduce_lse = torch.log(weights)  + log_p_y
                    log_p_y = torch.logsumexp( torch.logsumexp(reduce_lse, dim = 0) -0.5*torch.log(cg.pi) ,dim = 1).sum(1) - MB*numpy.log(S_MC_NNet)
                else:
                    # Note that we just need to remove the extra dimension we added for using the same code
                    log_p_y = log_p_y.squeeze(dim = 2)
                    weights = weights.squeeze(dim = 2)
        
                    ## Reduce log ws + log_p_y_s using logsumexp trick. Also reduce MB and add the constant
                    reduce_lse = torch.log(weights) + log_p_y
                    log_p_y = (torch.logsumexp(reduce_lse, dim = 0)).sum(-1) - 0.5*MB*torch.log(cg.pi)

            ## ===================
            ## == with linear mean
            elif isinstance(self.likelihood,GaussianLinearMean):
                ## ================================================== ##
                ## === Compute moments of predictive distribution === ##
                m_Y,K_Y, mean_q_f, cov_q_f = self.predictive_distribution(X_run, diagonal = True)

                ## =================================== ##
                ## === Compute test log likelihood === ##
                # Re-scale Y_std
                Y = Y.t() # (Dy,MB)
                Y_std = Y_std.view(self.out_dim,1) # (Dy,1)

                log_p_y = batched_log_Gaussian( obs = Y_std*Y, mean = Y_std*m_Y, cov = (Y_std*torch.sqrt(K_Y))**2, diagonal = True, cov_is_inverse = False)

                predictive_params = None
                if return_moments:
                    predictive_params = [m_Y,K_Y]

            ## =============================================================== ##
            ## ============ BERNOULLI/CATEGORICAL LIKELIHOOD ================ ##
            elif isinstance(self.likelihood,MulticlassCategorical) or isinstance(self.likelihood,Bernoulli):

                # We cannot do exact integration here; whether we warp or not, the procedure is very similar to GP classification. The only difference is
                # binary classification with a Gauss CDF link function
                m_Y, _, mean_q_f, cov_q_f = self.predictive_distribution(X_run,diagonal = True, S_MC_NNet = S_MC_NNet)

                check = torch.logical_not(torch.isfinite(m_Y)).float()
                assert check.sum() == 0.0, "Got saturated probabilities"

                if isinstance(self.likelihood,Bernoulli): # turn the vector as if it became from the MulticlassCategorical so that this is transparent to the trainer
                    m_Y     = m_Y.squeeze() 
                    neg_m_Y = 1.0-m_Y # compute the probability of class 0
                    m_Y     = torch.stack((neg_m_Y,m_Y),dim = 1) 

                _, _ , _ , log_p_y = compute_calibration_measures(m_Y.float() ,Y ,apply_softmax = False ,bins = 15)  

                log_p_y = -1*((log_p_y*MB).sum()) # compute_calibration_measures returns log_p_y.mean(), hence we undo that by multiplying by MB and then summing up

                predictive_params = None
                if return_moments:
                    predictive_params = [m_Y]

            else:
                raise ValueError("Unsupported likelihood [{}] for class [{}]".format(type(self.likelihood),type(self)))

        self.train() # set parameters for train mode. Batch normalization, dropout etc
        return log_p_y, predictive_params
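
The quadrature reduction above (log weights plus log densities, combined with a logsumexp and the -0.5*log(pi) constant) is standard Gauss-Hermite integration. A standalone sketch with hypothetical scalar values, checked against the closed-form Gaussian marginal:

import numpy as np

# Hypothetical sketch of the Gauss-Hermite estimator of
#   log p(y) = log \int N(f; m, v) N(y; f, s2) df
# reduced with a log-sum-exp, compared against the closed form N(y; m, v + s2).

def log_normal(y, mu, var):
    return -0.5 * np.log(2.0 * np.pi * var) - 0.5 * (y - mu) ** 2 / var

m, v, s2, y = 0.3, 0.5, 0.4, 1.2
locs, weights = np.polynomial.hermite.hermgauss(30)   # quadrature locations and weights

shifted_locs = np.sqrt(2.0 * v) * locs + m            # shift locations into f-space
log_p = log_normal(y, shifted_locs, s2)
approx = np.log(np.sum(weights * np.exp(log_p))) - 0.5 * np.log(np.pi)

exact = log_normal(y, m, v + s2)
assert np.isclose(approx, exact, atol=1e-6)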
Example #4
 def cube(self,tensor:torch.tensor):
     return tensor.mul(tensor.mul(tensor))
Example #5
def label_smoothing(y: torch.tensor, alpha: float) -> torch.tensor:
    return y.float() * (1 - alpha) + 0.5 * alpha
Example #6
 def valid_loss_compute(self, x: torch.tensor, y: torch.tensor, norm: int):
     x = self.model.generator(x)
     loss = self.labelsmooth(x.contiguous().view(-1, x.size(-1)),
                             y.contiguous().view(-1)) / norm
     return loss.item() * norm
Example #7
 def forward(self, x: torch.tensor):
     mean = x.mean(-1, keepdim=True)
     std = x.std(-1, keepdim=True)
     return self.a_2 * (x - mean) / (std + self.eps) + self.b_2
Example #8
    def compressed_allreduce(self,
                             buffer_m: torch.tensor,
                             worker_error,
                             server_error,
                             local_rank):

        # all_start_time = time.time()
        original_shape = buffer_m.size()
        if len(original_shape) > 1:
            buffer_m = torch.flatten(buffer_m)
        original_size = buffer_m.numel()
        worker_error_size = worker_error.numel()
        cupy.cuda.Device(local_rank).use()

        if original_size != worker_error_size:
            empty_tensor = torch.zeros(worker_error_size - original_size,
                                       device=buffer_m.device)
            buffer_m = torch.cat([buffer_m, empty_tensor])

        buffer_m.add_(worker_error)
        worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
        worker_error.set_(buffer_m - worker_scale *
                          buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

        if self.bool_not_supported:
            cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    buffer_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
                self.size)
        else:
            cupy_sign_list_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(buffer_m.sign_().add_(1).bool()),
                self.size)
        cupy_worker_scale = self.compression_backend.torch2cupy(worker_scale)

        cupy_recvbuf_sign = cupy.zeros(
            [self.size,
             cupy_sign_list_packed[self.rank].size],
            dtype=cupy_sign_list_packed[0].dtype)
        # cupy_recvbuf_scale = cupy.zeros([self.size, 1], dtype=cupy_worker_scale.dtype)

        sign_list_packed = [
            self.compression_backend.cupy2torch(cupy_sign_list_packed[idx])
            for idx in range(self.size)
        ]

        # worker_scale = self.compression_backend.cupy2torch(cupy_worker_scale)
        recvbuf_sign = self.compression_backend.cupy2torch(cupy_recvbuf_sign)
        #recvbuf_scale = self.compression_backend.cupy2torch(cupy_recvbuf_scale)
        recvbuf_scale = [
            torch.zeros(1,
                        dtype=worker_scale.dtype,
                        device=torch.device(local_rank)) for i in range(self.size)
        ]

        # communication phase 1
        # gather_start = time.time()
        # Alltoall for sign
        dist.all_to_all_single(recvbuf_sign,
                               torch.stack(sign_list_packed),
                               group=self.world_group)
        # Allgather for scale
        dist.all_gather(recvbuf_scale, worker_scale, group=self.world_group)

        # gather_end = time.time()

        # cupy_sign_list_packed, sign_list_packed, cupy_worker_scale, worker_scale = None, None, None, None
        cupy_sign_list_packed = None

        cupy_recvbuf_sign = self.compression_backend.torch2cupy(recvbuf_sign)
        #cupy_recvbuf_scale = self.compression_backend.torch2cupy(torch.stack(recvbuf_scale))

        compensated_server_m = self.compression_backend.cupy2torch(
            (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(
                self.size,
                -1)).float().add_(-0.5).mul_(2.0).mul_(
                    torch.stack(recvbuf_scale).mul_(1 / self.size)).sum(0)
        compensated_server_m.add_(server_error)
        server_scale = torch.norm(compensated_server_m) / np.sqrt(
            compensated_server_m.numel())
        server_error.set_(
            compensated_server_m - server_scale *
            compensated_server_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0))

        # cupy_server_scale = self.compression_backend.torch2cupy(server_scale)

        if self.bool_not_supported:
            cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    compensated_server_m.sign_().add_(1).bool().to(dtype=torch.uint8)),
                1)
        else:
            cupy_server_sign_packed = self.compression_backend.compress_by_chunk(
                self.compression_backend.torch2cupy(
                    compensated_server_m.sign_().add_(1).bool()),
                1)
        compensated_server_m = None

        cupy_recvbuf_sign_server = cupy.zeros(
            [self.size,
             cupy_server_sign_packed[0].size],
            dtype=cupy_recvbuf_sign.dtype)
        # cupy_recvbuf_sign, recvbuf_sign = None, None
        cupy_recvbuf_sign = None

        server_sign_packed = [
            self.compression_backend.cupy2torch(cupy_server_sign_packed[0])
        ]
        recvbuf_sign_server = [
            self.compression_backend.cupy2torch(cupy_recvbuf_sign_server[idx])
            for idx in range(self.size)
        ]

        # server_scale = self.compression_backend.cupy2torch(cupy_server_scale)
        cupy_recvbuf_scale_server = cupy.zeros([self.size,
                                                1],
                                               dtype=cupy_worker_scale.dtype)
        # cupy_recvbuf_scale, recvbuf_scale = None, None

        recvbuf_scale_server = [
            self.compression_backend.cupy2torch(cupy_recvbuf_scale_server[idx])
            for idx in range(self.size)
        ]

        # Communication Phase 2
        dist.all_gather(recvbuf_sign_server,
                        server_sign_packed[0],
                        group=self.world_group)
        dist.all_gather(recvbuf_scale_server, server_scale, group=self.world_group)

        cupy_server_sign_packed = None

        # need to convert from a tensor list to a single tensor
        # dist.all_gather only provides a tensor list as the recv/output buffer
        recvbuf_sign_server = torch.stack(recvbuf_sign_server)

        cupy_recvbuf_sign_server = self.compression_backend.torch2cupy(
            recvbuf_sign_server)

        buffer_m.data.copy_(
            self.compression_backend.cupy2torch(
                (cupy.unpackbits(cupy_recvbuf_sign_server.flatten())).reshape(
                    self.size,
                    -1)).float().add_(-0.5).mul_(2.0).mul_(
                        self.compression_backend.cupy2torch(
                            cupy_recvbuf_scale_server)).flatten().data)
        if original_size != worker_error_size:
            buffer_m = buffer_m[0:original_size]
        if len(original_shape) > 1:
            buffer_m = buffer_m.reshape(original_shape)

        return buffer_m
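
A single-process sketch of the sign-compression-with-error-feedback step that the method above applies on both the worker and the server side (hypothetical, no cupy and no collective communication):

import torch

# Hypothetical sketch of 1-bit compression with error feedback, mirroring the
# worker-side step above: the message is scale * sign(x); the residual is kept
# locally and added back into the next round's buffer.
def one_bit_compress(buffer_m: torch.tensor, error: torch.tensor):
    buffer_m = buffer_m + error                                         # error feedback
    scale = torch.norm(buffer_m) / buffer_m.numel() ** 0.5              # per-tensor scale
    sign = buffer_m.sign().add_(1).bool().float().add_(-0.5).mul_(2.0)  # maps >= 0 to +1, < 0 to -1
    error = buffer_m - scale * sign                                     # residual for the next round
    return scale * sign, error

x = torch.randn(8)
message, error = one_bit_compress(x, torch.zeros(8))
print(message, error)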
Example #9
def form_real(number: torch.tensor) -> str:
    return "{:.3f}".format(number.item())
Example #10
 def clipping_weight(self, weight: torch.tensor) -> torch.tensor:
     with torch.set_grad_enabled(False):
         weight = torch.clamp(weight, -1, 1)
     weight.requires_grad = True
     return weight
Example #11
def conditional_to_cuda(x: torch.tensor,
                        non_blocking: bool = False) -> torch.tensor:
    #print(x.cuda.__doc__)
    #return x.cuda(non_blocking=non_blocking) if args.gpu_count > 0 else x
    return x.cuda() if args.gpu_count > 0 else x
Example #12
 def deterministic(weight: torch.tensor) -> torch.tensor:
     return weight.sign()
Example #13
    def _hook_properties(hook_self, tensor_type: torch.tensor):
        """Overloads tensor_type properties

           Parameters: tensor_type: Torch tensor

        """
        @property
        def child(self):
            try:
                try:
                    assert self._child is not None
                    return self._child
                except (AttributeError, AssertionError):
                    self._child = _LocalTensor(child=self,
                                               parent=self,
                                               torch_type=type(self).__name__)
                    return self._child
            except TypeError:
                # for some reason, hasattr(self, '_child') returns a TypeError saying
                # "TypeError: 'NoneType' object is not callable". It's supposed to only
                # return False and I can't get to the bottom of it. So, for now, I'm
                # going to break a personal rule and use try/catch for logic, but
                # this is merely supposed to evaluate whether self has ._child as an
                # attribute. Note this only seems to happen when self is a
                # torch.autograd.Variable

                self._child = _LocalTensor(child=self,
                                           parent=self,
                                           torch_type=type(self).__name__)
                return self._child

        @child.setter
        def child(self, value):
            self._child = value

        tensor_type.child = child

        @property
        def id(self):
            return self.child.id

        # TODO: this should not be possible, but it should also be possible to define a FloatTensor
        # with a specific id. This is in theory possible, but it doesn't seem to work in practice

        @id.setter
        def id(self, new_id):
            self.child.id = new_id
            return self

        tensor_type.id = id

        @property
        def location(self):
            return self.child.location

        tensor_type.location = location

        @property
        def id_at_location(self):
            return self.child.id_at_location

        tensor_type.id_at_location = id_at_location

        @property
        def owner(self):
            return self.child.owner

        tensor_type.owner = owner
Example #14
def flatten(t: torch.tensor):
    t = t.reshape((1, -1))
    t = t.squeeze()
    print('after flatten:', t)
    return t
Example #15
def flatten_conv(conv_input: tensor) -> tensor:
    batch_size = list(conv_input.size())[0]
    return conv_input.view(batch_size, -1)
Example #16
 def forward(self, input: torch.tensor) -> torch.tensor:
     assert len(input.size()) == 3, 'The number of dimensions of input tensor must be 3!'
     # reflect padding to match lengths of in/out
     input = F.pad(input, (1, 0), 'reflect')
     return F.conv1d(input, self.flipped_filter)
Example #17
def to_np(t: torch.tensor) -> np.array:
    """Converts a PyTorch tensor to a Numpy array.
    """
    return t.cpu().detach().numpy()
Example #18
 def forward(self, input: torch.tensor) -> torch.tensor:
     x, _ = self.rnn(input.transpose(1, 2))
     return x.transpose(1, 2)
Example #19
 def forward(self, x: torch.tensor):
     x = x + self.pe[:, :x.size(1)]
     return self.dropout(x)
Example #20
 def mc_tensor(input: torch.tensor, k: int):
     mc_shape = [input.shape[0], k] + list(input.shape[1:])
     return input.unsqueeze(1).expand(mc_shape).flatten(0, 1)
Example #21
 def acc_mean_from_confusion_matrix(self, cm: torch.tensor):
     cm[0] = cm[0:4].sum(dim=0)
     cm[1] = cm[4]
     cm = cm / torch.sum(cm, dim=1, keepdim=True)
     print(cm.diag())
     return cm.diag().mean()
Example #22
    def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor):
        """
        One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
        and possibly a parameter update (depending on the gradient accumulation).

        Input:
        ------
        input_ids: `torch.tensor(bs, seq_length)` - The token ids.
        attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
        lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
        """
        if self.mlm:
            s_logits, s_hidden_states = self.student(
                input_ids=input_ids, attention_mask=attention_mask
            )  # (bs, seq_length, voc_size)
            with torch.no_grad():
                t_logits, t_hidden_states = self.teacher(
                    input_ids=input_ids, attention_mask=attention_mask
                )  # (bs, seq_length, voc_size)
        else:
            s_logits, _, s_hidden_states = self.student(
                input_ids=input_ids, attention_mask=None
            )  # (bs, seq_length, voc_size)
            with torch.no_grad():
                t_logits, _, t_hidden_states = self.teacher(
                    input_ids=input_ids, attention_mask=None
                )  # (bs, seq_length, voc_size)
        assert s_logits.size() == t_logits.size()

        # https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
        # https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
        if self.params.restrict_ce_to_mask:
            mask = (lm_labels > -1).unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
        else:
            mask = attention_mask.unsqueeze(-1).expand_as(s_logits)  # (bs, seq_length, voc_size)
        s_logits_slct = torch.masked_select(s_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
        s_logits_slct = s_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
        t_logits_slct = torch.masked_select(t_logits, mask)  # (bs * seq_length * voc_size) modulo the 1s in mask
        t_logits_slct = t_logits_slct.view(-1, s_logits.size(-1))  # (bs * seq_length, voc_size) modulo the 1s in mask
        assert t_logits_slct.size() == s_logits_slct.size()

        loss_ce = (
            self.ce_loss_fct(
                F.log_softmax(s_logits_slct / self.temperature, dim=-1),
                F.softmax(t_logits_slct / self.temperature, dim=-1),
            )
            * (self.temperature) ** 2
        )
        loss = self.alpha_ce * loss_ce

        if self.alpha_mlm > 0.0:
            loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
            loss += self.alpha_mlm * loss_mlm
        if self.alpha_clm > 0.0:
            shift_logits = s_logits[..., :-1, :].contiguous()
            shift_labels = lm_labels[..., 1:].contiguous()
            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss += self.alpha_clm * loss_clm

        if self.alpha_mse > 0.0:
            loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct) / s_logits_slct.size(
                0
            )  # Reproducing batchmean reduction
            loss += self.alpha_mse * loss_mse
        if self.alpha_cos > 0.0:
            s_hidden_states = s_hidden_states[-1]  # (bs, seq_length, dim)
            t_hidden_states = t_hidden_states[-1]  # (bs, seq_length, dim)
            mask = attention_mask.unsqueeze(-1).expand_as(s_hidden_states)  # (bs, seq_length, dim)
            assert s_hidden_states.size() == t_hidden_states.size()
            dim = s_hidden_states.size(-1)

            s_hidden_states_slct = torch.masked_select(s_hidden_states, mask)  # (bs * seq_length * dim)
            s_hidden_states_slct = s_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)
            t_hidden_states_slct = torch.masked_select(t_hidden_states, mask)  # (bs * seq_length * dim)
            t_hidden_states_slct = t_hidden_states_slct.view(-1, dim)  # (bs * seq_length, dim)

            target = s_hidden_states_slct.new(s_hidden_states_slct.size(0)).fill_(1)  # (bs * seq_length,)
            loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target)
            loss += self.alpha_cos * loss_cos

        self.total_loss_epoch += loss.item()
        self.last_loss = loss.item()
        self.last_loss_ce = loss_ce.item()
        if self.alpha_mlm > 0.0:
            self.last_loss_mlm = loss_mlm.item()
        if self.alpha_clm > 0.0:
            self.last_loss_clm = loss_clm.item()
        if self.alpha_mse > 0.0:
            self.last_loss_mse = loss_mse.item()
        if self.alpha_cos > 0.0:
            self.last_loss_cos = loss_cos.item()

        self.optimize(loss)

        self.n_sequences_epoch += input_ids.size(0)
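
A standalone sketch of the temperature-scaled soft-target term computed in `step` above, with hypothetical logits in place of real model outputs:

import torch
import torch.nn.functional as F

# Hypothetical sketch of the distillation loss: KLDivLoss("batchmean") between the
# softened teacher distribution and the student's softened log-probabilities, times T^2.
T = 2.0
s_logits = torch.randn(4, 10)   # (n_selected_tokens, voc_size), student
t_logits = torch.randn(4, 10)   # (n_selected_tokens, voc_size), teacher

ce_loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
loss_ce = ce_loss_fct(
    F.log_softmax(s_logits / T, dim=-1),
    F.softmax(t_logits / T, dim=-1),
) * T ** 2
print(loss_ce.item())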
Example #23
def to_numpy(torch_tensor: torch.tensor) -> np.array:
    return torch_tensor.cpu().detach().numpy()
Example #24
    def __init__(
        self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module
    ):
        logger.info("Initializing Distiller")
        self.params = params
        self.dump_path = params.dump_path
        self.multi_gpu = params.multi_gpu
        self.fp16 = params.fp16

        self.student = student
        self.teacher = teacher

        self.student_config = student.config
        self.vocab_size = student.config.vocab_size

        if params.n_gpu <= 1:
            sampler = RandomSampler(dataset)
        else:
            sampler = DistributedSampler(dataset)

        if params.group_by_size:
            groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
            sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
        else:
            sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)

        self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)

        self.temperature = params.temperature
        assert self.temperature > 0.0

        self.alpha_ce = params.alpha_ce
        self.alpha_mlm = params.alpha_mlm
        self.alpha_clm = params.alpha_clm
        self.alpha_mse = params.alpha_mse
        self.alpha_cos = params.alpha_cos

        self.mlm = params.mlm
        if self.mlm:
            logger.info(f"Using MLM loss for LM step.")
            self.mlm_mask_prop = params.mlm_mask_prop
            assert 0.0 <= self.mlm_mask_prop <= 1.0
            assert params.word_mask + params.word_keep + params.word_rand == 1.0
            self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
            self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
            self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
            if self.fp16:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
            logger.info(f"Using CLM loss for LM step.")

        self.epoch = 0
        self.n_iter = 0
        self.n_total_iter = 0
        self.n_sequences_epoch = 0
        self.total_loss_epoch = 0
        self.last_loss = 0
        self.last_loss_ce = 0
        self.last_loss_mlm = 0
        self.last_loss_clm = 0
        if self.alpha_mse > 0.0:
            self.last_loss_mse = 0
        if self.alpha_cos > 0.0:
            self.last_loss_cos = 0
        self.last_log = 0

        self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        if self.alpha_mse > 0.0:
            self.mse_loss_fct = nn.MSELoss(reduction="sum")
        if self.alpha_cos > 0.0:
            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")

        logger.info("--- Initializing model optimizer")
        assert params.gradient_accumulation_steps >= 1
        self.num_steps_epoch = len(self.dataloader)
        num_train_optimization_steps = (
            int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
        )

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay": params.weight_decay,
            },
            {
                "params": [
                    p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay": 0.0,
            },
        ]
        logger.info(
            "------ Number of trainable parameters (student): %i"
            % sum([p.numel() for p in self.student.parameters() if p.requires_grad])
        )
        logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
        self.optimizer = AdamW(
            optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
        )

        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
        )

        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
            self.student, self.optimizer = amp.initialize(
                self.student, self.optimizer, opt_level=self.params.fp16_opt_level
            )
            self.teacher = self.teacher.half()

        if self.multi_gpu:
            if self.fp16:
                from apex.parallel import DistributedDataParallel

                logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
                self.student = DistributedDataParallel(self.student)
            else:
                from torch.nn.parallel import DistributedDataParallel

                logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
                self.student = DistributedDataParallel(
                    self.student,
                    device_ids=[params.local_rank],
                    output_device=params.local_rank,
                    find_unused_parameters=True,
                )

        self.is_master = params.is_master
        if self.is_master:
            logger.info("--- Initializing Tensorboard")
            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
            self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
            self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
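
To make the optimizer-step bookkeeping above concrete, a small arithmetic sketch with hypothetical values (none of these numbers come from the source):

import math

num_steps_epoch = 1000               # hypothetical len(dataloader)
gradient_accumulation_steps = 4
n_epoch = 3
warmup_prop = 0.05

num_train_optimization_steps = int(num_steps_epoch / gradient_accumulation_steps * n_epoch) + 1  # 751
warmup_steps = math.ceil(num_train_optimization_steps * warmup_prop)                             # 38
print(num_train_optimization_steps, warmup_steps)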
Example #25
 def forward(self, input: torch.tensor):
     return input.view(input.size(0), -1)
Example #26
def compute_avg(inp: List, nums: torch.tensor) -> float:
    "Computes average given list of torch.tensor and numbers corresponding to them"
    return (torch.stack(inp) * nums).sum() / nums.sum()
Example #27
    def predictive_distribution(self, X: torch.tensor, diagonal: bool = True, S_MC_NNet: int = None) -> list:
        """ This function computes moments 1 and 2 of the predictive distribution.
            It also returns the posterior mean and covariance over latent functions.

            p(Y*|X*) = \int p(y*|G(f*)) q(f*,f|u) q(u) df*,df,du

                # Homoscedastic Gaussian observation model p(y|f)
                # GP variational distribution q(f)
                # G() represents a non-linear transformation

                Args:
                        `X`                (torch.tensor)  :->: input locations where the predictive is computed. Can have shape (MB,Dx) or (Dy,MB,Dx)
                        `diagonal`         (bool)          :->: if true, samples are drawn independently. For the moment this is always true.
                        `S_MC_NNet`        (int)           :->: Number of samples from the dropout distribution if fully_bayesian is true

                Returns:
                        `m1`       (torch.tensor)  :->:  Predictive mean with shape (Dy,MB)
                        `m2`       (torch.tensor)  :->:  Predictive variance with shape (Dy,MB). Is None for classification likelihoods
                        `mean_q_f` (torch.tensor)  :->:  Posterior mean of q(f) with shape (Dy,MB,1) [same shape as returned by marginal_variational_qf]
                        `cov_q_f`  (torch.tensor)  :->:  Posterior covariance of q(f) with shape (Dy,MB,1) [same shape as returned by marginal_variational_qf]

        """
        if len(X.shape) == 2:
            X = X.repeat(self.out_dim,1,1)
        assert len(X.shape) == 3, "Bad input specification"

        self.eval() # set parameters for eval mode. Batch normalization, dropout etc
        if self.fully_bayesian:
            # activate dropout if required
            is_dropout = enable_eval_dropout(self.modules())
            assert is_dropout, "You set the model to fully Bayesian mode but there are no dropout layers in your model. This is asserted because otherwise the code would silently run in non-Bayesian mode"

            assert S_MC_NNet is not None, "The parameter S_MC_NNet was left at its default of None, which is invalid when self.fully_bayesian is True"

        with torch.no_grad():
            if not diagonal:
                raise NotImplementedError("This function does not support returning the predictive distribution with correlations")

            mean_q_f, cov_q_f = self.marginal_variational_qf_parameters(X, diagonal = True, is_duvenaud = False, init_Z = None)

            if self.fully_bayesian: # @NOTE: this has not been refactored as with the rest of the code. But note that we could do both point estimate and bayesian by setting S_MC_NNet = 1 for the non
                                    #  bayesian case.
                # If it is fully Bayesian then do it as in the DGP with flows in the output layer
                Dy,MB,_ = mean_q_f.shape

                # 1. Reshape mean_q_f and cov_q_f to shape (Dy,S_MC_NNet*MB)
                mean_q_f_run = mean_q_f.view(Dy,MB).repeat(1,S_MC_NNet)
                cov_q_f_run  = cov_q_f.view(Dy,MB).repeat(1,S_MC_NNet)

                # 2. Compute moments of each of the montecarlos. Just need to provide X extended to S_MC so that each forward computes a monte carlo
                X = X.repeat(1,S_MC_NNet,1) # expand to shape (Dy,S*MB,Dx). 
                MOMENTS = self.likelihood.marginal_moments(mean_q_f_run, cov_q_f_run, self.G_matrix, X) # get the moments of each S*MB samples

                # 3. Compute the moments from the full predictive distribution, e.g the mixture of Gaussians for Gaussian Likelihood
                if isinstance(self.likelihood,GaussianNonLinearMean):
                    m_Y,C_Y = MOMENTS
                    m_Y = m_Y.view(Dy,S_MC_NNet,MB)
                    C_Y = C_Y.view(Dy,S_MC_NNet,MB)

                    m1 = m_Y.mean(1)
                    m2 = ( C_Y + m_Y**2 ).mean(1) - m1**2 # var = 1/S * sum[K_Y + mu_y^2 ] -[1/S sum m1]^2

                elif isinstance(self.likelihood,MulticlassCategorical) or isinstance(self.likelihood,Bernoulli):
                    m1,m2 = MOMENTS,None
                        
                    m1 = m1.view(S_MC_NNet,MB,Dy)
                    m1 = m1.mean(0) # reduce the monte carlo dimension

                else:
                    raise ValueError("Unsupported likelihood [{}] for class [{}]".format(type(self.likelihood),type(self)))

            else:

                MOMENTS = self.likelihood.marginal_moments(mean_q_f.squeeze(dim = 2), cov_q_f.squeeze(dim = 2), diagonal = True, flow = self.G_matrix, X = X) # diagonal True always. Is an element only used by the sparse_MF_GP with SVI. Diag = False is used by standard GP's marginal likelihood

                if isinstance(self.likelihood,GaussianLinearMean) or isinstance(self.likelihood,GaussianNonLinearMean):
                    m1,m2 = MOMENTS 
                elif isinstance(self.likelihood,MulticlassCategorical) or isinstance(self.likelihood, Bernoulli):
                    m1,m2 = MOMENTS,None

        self.train() # switch back to train mode. 
        return m1,m2, mean_q_f, cov_q_f
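
The moment matching in the fully Bayesian branch above uses the standard mixture-of-Gaussians identities; with S Monte Carlo components of mean m_s and variance C_s:

    m_1 = \frac{1}{S} \sum_s m_s
    m_2 = \frac{1}{S} \sum_s ( C_s + m_s^2 ) - m_1^2

which is exactly the `( C_Y + m_Y**2 ).mean(1) - m1**2` computation performed for the GaussianNonLinearMean case.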
Example #28
def label_to_onehot(labels: torch.tensor, n_classes: int) -> torch.tensor:

    onehot = torch.nn.functional.one_hot(labels.type(torch.int64).flatten(), num_classes=n_classes).type(torch.float64)

    return onehot
Example #29
 def cov(x: torch.tensor) -> torch.tensor:
     x = x - torch.mean(x, dim=1, keepdim=True)
     return (1. / (x.size(1) - 1)) * x.matmul(x.t())
Example #30
def tensor2list(cudatensor: torch.tensor) -> List:
    return list(cudatensor.cpu().numpy())