Example #1
0
    def trace(self, maxIter=100, tol=1e-3):
        """
        Compute the trace of the Hessian using Hutchinson's method.

        maxIter: maximum number of Rademacher probes to draw
        tol: relative tolerance used to stop early once the running
             trace estimate stabilizes

        Returns the list of per-probe v^T H v estimates; their mean is
        the trace estimate.
        """
        device = self.device
        trace_vhv = []  # per-probe estimates of v^T H v
        trace = 0.      # running mean from the previous iteration

        for _ in range(maxIter):
            self.model.zero_grad()
            # Rademacher probe: entries drawn uniformly from {-1, +1}
            v = [
                torch.randint_like(p, high=2, device=device)
                for p in self.params
            ]
            for v_i in v:
                v_i[v_i == 0] = -1

            if self.full_dataset:
                _, Hv = self.dataloader_hv_product(v)
            else:
                Hv = hessian_vector_product(self.gradsH, self.params, v)
            trace_vhv.append(group_product(Hv, v).cpu().item())

            # BUG FIX: use abs(trace) in the denominator. With the original
            # `trace + 1e-6`, a negative running trace made the denominator
            # negative and the `< tol` test trivially true, causing a
            # spurious early return after the second probe.
            if abs(np.mean(trace_vhv) - trace) / (abs(trace) + 1e-6) < tol:
                return trace_vhv
            trace = np.mean(trace_vhv)

        return trace_vhv
Example #2
0
    def dataloader_hv_product(self, v):
        """
        Hessian-vector product H v averaged over the whole dataloader.

        v: list of tensors matching self.params

        Returns (eigenvalue, THv) where eigenvalue = v^T (H v) and THv
        is the dataset-averaged Hessian-vector product.
        """
        device = self.device
        total_count = 0  # samples seen so far across all batches

        # Running (unnormalized) accumulator for H v, one tensor per param.
        accumulated = [torch.zeros(p.size()).to(device) for p in self.params]

        for inputs, targets in self.data:
            self.model.zero_grad()
            batch_size = inputs.size(0)
            outputs = self.model(inputs.to(device))
            loss = self.criterion(outputs, targets.to(device))
            # create_graph=True keeps the graph so the gradient itself can
            # be differentiated again below.
            loss.backward(create_graph=True)
            params, gradsH = get_params_grad(self.model)
            self.model.zero_grad()
            Hv = torch.autograd.grad(gradsH,
                                     params,
                                     grad_outputs=v,
                                     only_inputs=True,
                                     retain_graph=False)
            # Weight each batch's contribution by its sample count.
            accumulated = [
                acc + hv * float(batch_size) + 0.
                for acc, hv in zip(accumulated, Hv)
            ]
            total_count += float(batch_size)

        THv = [acc / float(total_count) for acc in accumulated]
        eigenvalue = group_product(THv, v).cpu().item()
        return eigenvalue, THv
Example #3
0
    def eigenvalues(self, maxIter=100, tol=1e-3, top_n=1):
        """
        Compute the top_n eigenvalues of the Hessian by power iteration
        with deflation.

        maxIter: maximum iterations used to compute each single eigenvalue
        tol: relative tolerance between two consecutive eigenvalue
             estimates from power iteration
        top_n: number of leading eigenvalues to compute

        Returns (eigenvalues, eigenvectors).
        """
        assert top_n >= 1

        device = self.device

        eigenvalues = []
        eigenvectors = []

        for _ in range(top_n):
            eigenvalue = None
            # Random start vector, normalized to unit length.
            v = [torch.randn(p.size()).to(device) for p in self.params]
            v = normalization(v)

            for i in range(maxIter):
                # Deflate: stay orthogonal to previously found eigenvectors.
                v = orthnormal(v, eigenvectors)
                self.model.zero_grad()

                if self.full_dataset:
                    tmp_eigenvalue, Hv = self.dataloader_hv_product(v)
                else:
                    Hv = hessian_vector_product(self.gradsH, self.params, v)
                    tmp_eigenvalue = group_product(Hv, v).cpu().item()

                v = normalization(Hv)

                # `is None` rather than `== None` (PEP 8). Stop when the
                # relative change of the Rayleigh quotient falls below tol.
                if eigenvalue is not None and abs(
                        eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) +
                                                        1e-6) < tol:
                    break
                eigenvalue = tmp_eigenvalue

            eigenvalues.append(eigenvalue)
            eigenvectors.append(v)

        return eigenvalues, eigenvectors
Example #4
0
    def trace(self, maxIter=100, tol=1e-3):
        """
        Compute per-layer traces of the (masked) Hessian using
        Hutchinson's method.

        maxIter: number of Rademacher probes to draw
        tol: kept for interface compatibility (the early-stopping check
             that used it is disabled; all maxIter probes are drawn)

        Returns (self.names, per-layer sums of v^T H v across probes).
        """
        device = self.device
        trace_vhv = []  # one list of per-layer v^T H v values per probe

        for _ in range(maxIter):
            self.model.zero_grad()
            # Rademacher probe: entries drawn uniformly from {-1, +1}
            v = [
                torch.randint_like(p, high=2, device=device)
                for p in self.params
            ]
            for gradH, param, v_i in zip(self.gradsH, self.params, v):
                v_i[v_i == 0] = -1
                # Zero out entries for dropped (small-magnitude) weights.
                mask = get_weight_mask(param.abs(),
                                       0 if self.drop is None else self.drop)
                # BUG FIX: multiply in place. The original `v_i = v_i * mask`
                # only rebound the loop variable, so the mask was never
                # actually applied to the probe vectors stored in v.
                v_i *= mask
                gradH *= mask  # in-place: self.gradsH is masked as well

            # Snapshot of the masked probe, detached from the graph, for the
            # final inner product.
            v2 = [vi.detach().clone() for vi in v]
            if self.full_dataset:
                _, Hv = self.dataloader_hv_product(v)
            else:
                Hv = hessian_vector_product(self.gradsH, self.params, v)
            # group_product here yields one value per layer.
            gp = [g.cpu().item() for g in group_product(Hv, v2)]
            trace_vhv.append(gp)

        return self.names, np.sum(trace_vhv, axis=0)
Example #5
0
    def sketch(self, d, debug=False):
        """
        Sketch the implicit n x n Hessian down to a d x d matrix.

        d Rademacher probe vectors v_1..v_d are drawn and normalized;
        the sketch entries are A[i, j] = v_i^T (H v_j) / d.

        Returns a d x d numpy array.
        """
        device = self.device
        print("starting")
        print("d = " + str(d))
        print(time.time())

        probes = []   # the d normalized Rademacher vectors
        hv_list = []  # the matching Hessian-vector products
        for idx in range(d):
            print(idx)
            # Rademacher probe: entries in {-1, +1}, then normalized.
            probe = [
                torch.randint_like(p, high=2, device=device)
                for p in self.params
            ]
            for part in probe:
                part[part == 0] = -1
            probe = normalization(probe)
            probes.append(probe)
            # Compute H v for this probe.
            self.model.zero_grad()
            if self.full_dataset:
                _, hv = self.dataloader_hv_product(probe)
            else:
                hv = hessian_vector_product(self.gradsH, self.params, probe)
            hv_list.append(hv)

        print(time.time())
        # Fill the sketch as A[i, j] = v_i' * (H v_j), scaled by 1/d.
        sketched_hessian = np.zeros((d, d))
        for row in range(d):
            for col in range(d):
                sketched_hessian[row, col] = group_product(
                    probes[row], hv_list[col]).cpu().item() / d
        print(time.time())
        return sketched_hessian
Example #6
0
 def dataloader_hv_product(self, v):
     """
     Dataset-averaged Hessian-vector product H v for a stateful model
     that is stepped through time (model.init / model.step API).

     v: list of tensors matching self.params
     Returns (eigenvalue, THv) where eigenvalue = v^T (H v) and THv is
     the accumulated Hessian-vector product divided by num_data.
     """
     device = self.device
     num_data = 0  # count the number of datum points in the dataloader
     # Running (unnormalized) accumulator for H v, one tensor per param.
     THv = [torch.zeros(p.size()).to(device)
            for p in self.params]  # accumulate result
     for i, (inputs, targets) in enumerate(self.data):
         self.model.zero_grad()
         inputs = inputs.to(device)
         targets = targets.to(device)
         tmp_num_data = inputs.size(0)
         # Skip the first `burnin` timesteps while the network state settles.
         burnin = 50
         self.model.init(inputs, burnin=burnin)
         t_sample = inputs.shape[1]  # assumes dim 1 of inputs is time — TODO confirm
         # Mask out timesteps whose target has no activity at all.
         loss_mask = (targets.sum(2) > 0).unsqueeze(2).float().to(device)
         for t in (range(burnin, t_sample)):
             Sin_t = inputs[:, t]
             s, r, u = self.model.step(Sin_t)
             loss_ = self.criterion(s,
                                    r,
                                    u,
                                    target=targets[:, t, :],
                                    mask=loss_mask[:, t, :],
                                    sum_=False)
             # create_graph=True keeps the graph so the gradient can be
             # differentiated again by autograd.grad below.
             sum(loss_).backward(create_graph=True)
             params, gradsH = get_params_grad(self.model)
             self.model.zero_grad()
             Hv = torch.autograd.grad(gradsH,
                                      params,
                                      grad_outputs=v,
                                      only_inputs=True,
                                      retain_graph=False)
             # NOTE(review): THv accumulates once per TIMESTEP while
             # num_data grows once per BATCH, so timesteps are summed and
             # the final division averages over samples only — confirm
             # this is the intended normalization.
             THv = [
                 THv1 + Hv1 * float(tmp_num_data) + 0.
                 for THv1, Hv1 in zip(THv, Hv)
             ]
         num_data += float(tmp_num_data)
     THv = [THv1 / float(num_data) for THv1 in THv]
     eigenvalue = group_product(THv, v).cpu().item()
     return eigenvalue, THv
Example #7
0
    def density(self, iter=100, n_v=1):
        """
        compute estimated eigenvalue density using stochastic lanczos algorithm (SLQ)
        iter: number of iterations used to compute trace
        n_v: number of SLQ runs
        """

        device = self.device
        eigen_list_full = []   # Ritz value estimates, one list per SLQ run
        weight_list_full = []  # matching quadrature weights, one list per run

        for k in range(n_v):
            v = [
                torch.randint_like(p, high=2, device=device)
                for p in self.params
            ]
            # generate Rademacher random variables (entries in {-1, +1})
            for v_i in v:
                v_i[v_i == 0] = -1
            v = normalization(v)

            # standard lanczos algorithm initlization
            v_list = [v]
            w_list = []      # NOTE(review): appended to once, never read
            alpha_list = []  # diagonal entries of the tridiagonal matrix T
            beta_list = []   # off-diagonal entries of T
            ############### Lanczos
            for i in range(iter):
                self.model.zero_grad()
                # placeholder; overwritten by the Hessian-vector product below
                w_prime = [torch.zeros(p.size()).to(device) for p in self.params]
                if i == 0:
                    if self.full_dataset:
                        _, w_prime = self.dataloader_hv_product(v)
                    else:
                        w_prime = hessian_vector_product(
                            self.gradsH, self.params, v)
                    alpha = group_product(w_prime, v)
                    alpha_list.append(alpha.cpu().item())
                    # w = H v - alpha * v  (first Lanczos residual)
                    w = group_add(w_prime, v, alpha=-alpha)
                    w_list.append(w)
                else:
                    beta = torch.sqrt(group_product(w, w))
                    beta_list.append(beta.cpu().item())
                    if beta_list[-1] != 0.:
                        # We should re-orth it
                        v = orthnormal(w, v_list)
                        v_list.append(v)
                    else:
                        # generate a new vector (Lanczos breakdown: w vanished)
                        w = [torch.randn(p.size()).to(device) for p in self.params]
                        v = orthnormal(w, v_list)
                        v_list.append(v)
                    if self.full_dataset:
                        _, w_prime = self.dataloader_hv_product(v)
                    else:
                        w_prime = hessian_vector_product(
                            self.gradsH, self.params, v)
                    alpha = group_product(w_prime, v)
                    alpha_list.append(alpha.cpu().item())
                    # three-term recurrence: w = H v - alpha*v - beta*v_prev
                    w_tmp = group_add(w_prime, v, alpha=-alpha)
                    w = group_add(w_tmp, v_list[-2], alpha=-beta)

            # Assemble the symmetric tridiagonal matrix T from alpha/beta.
            T = torch.zeros(iter, iter).to(device)
            for i in range(len(alpha_list)):
                T[i, i] = alpha_list[i]
                if i < len(alpha_list) - 1:
                    T[i + 1, i] = beta_list[i]
                    T[i, i + 1] = beta_list[i]
            # NOTE(review): torch.eig is deprecated and removed in newer
            # PyTorch releases; this code assumes an older torch version.
            a_, b_ = torch.eig(T, eigenvectors=True)

            eigen_list = a_[:, 0]       # real parts of the eigenvalues
            weight_list = b_[0, :]**2   # squared first components = SLQ weights
            eigen_list_full.append(list(eigen_list.cpu().numpy()))
            weight_list_full.append(list(weight_list.cpu().numpy()))

        return eigen_list_full, weight_list_full
Example #8
0
    def trace_forced_lengthy(self, maxIter=150, num_reps=1, debug=False,
                             tol=1e-3):
        """
        As a test, run Hutchinson trace estimation without terminating the
        moment 'convergence' is first plausible — go up to a fixed number
        of iterations.

        maxIter: number of Rademacher probes to draw
        num_reps: kept for interface compatibility (currently unused)
        debug: print per-iteration progress
        tol: relative tolerance for the convergence check. BUG FIX: this
             name was previously read in the loop without ever being
             defined, so the function raised NameError at runtime.

        Returns the list of per-probe v^T H v estimates.
        """
        device = self.device
        trace_vhv = []
        trace = 0.

        # BUG FIX: these histories are appended to on every iteration, so
        # they must exist even when record_data is False (previously they
        # were only defined inside the record_data branch -> NameError).
        total_time_to_compute = []
        trace_estimate = []

        # Prepare to record data
        if self.record_data:
            now = datetime.datetime.now()
            timestamp = "_{:02d}{:02d}_{:02d}{:02d}{:02d}".format(
                now.day, now.month, now.hour, now.minute, now.second)
            save_file = self.data_save_dir + "Trace" + timestamp + ".txt"

        def _dump_history():
            # Write one row per iteration: index, elapsed time, estimate.
            with open(save_file, 'w') as f:
                f.write("Iteration\tTotal Elapsed Time(s)\tTrace Estimate\n")
                for j in range(len(total_time_to_compute)):
                    f.write("{}\t{}\t{}\n".format(j + 1,
                                                  total_time_to_compute[j],
                                                  trace_estimate[j]))

        start_time = time.time()
        for i in range(maxIter):
            if debug:
                print("Iteration {}".format(i))
            self.model.zero_grad()
            # Rademacher probe: entries drawn uniformly from {-1, +1}
            v = [
                torch.randint_like(p, high=2, device=device)
                for p in self.params
            ]
            for v_i in v:
                v_i[v_i == 0] = -1

            if self.full_dataset:
                _, Hv = self.dataloader_hv_product(v)
            else:
                Hv = hessian_vector_product(self.gradsH, self.params, v)
            trace_vhv.append(group_product(Hv, v).cpu().item())

            total_time_to_compute.append(time.time() - start_time)
            trace_estimate.append(np.mean(trace_vhv))

            # abs(trace) keeps the relative test meaningful for negative
            # running traces (a negative denominator made it trivially true).
            if abs(np.mean(trace_vhv) - trace) / (abs(trace) + 1e-6) < tol:
                if self.record_data:
                    _dump_history()
                return trace_vhv
            trace = np.mean(trace_vhv)

        # Trace did not converge within maxIter probes.
        if self.record_data:
            _dump_history()
        return trace_vhv
Example #9
0
    def eigenvalues_lanczos(self, k, debug=False):
        """
        Compute the top k eigenvalue estimates via the Lanczos method.

        k: number of Lanczos steps (and Ritz value estimates)
        debug: kept for interface compatibility (currently unused)

        Builds the (k+1) x k tridiagonal matrix T column by column using
        Hessian-vector products and returns np.linalg.eigvalsh of its
        upper k x k block.
        """
        device = self.device

        # BUG FIX: the timing history is appended to unconditionally below,
        # so it must exist even when record_data is False (previously it was
        # only defined inside the record_data branch -> NameError).
        total_time_to_compute = []

        # Prepare to record data
        if self.record_data:
            now = datetime.datetime.now()
            timestamp = "_{:02d}{:02d}_{:02d}{:02d}{:02d}".format(
                now.day, now.month, now.hour, now.minute, now.second)
            save_file = self.data_save_dir + "Lanczos" + timestamp + ".txt"

        start_time = time.time()
        # Pick a random first vector, making sure it has norm 1
        print("starting with q1")
        q0 = [torch.randn(p.size()).to(device) for p in self.params]
        q0 = normalization(q0)
        total_time_to_compute.append(time.time() - start_time)
        # Calculate Hq1
        self.model.zero_grad()
        if self.full_dataset:
            _, Hq0 = self.dataloader_hv_product(q0)
        else:
            Hq0 = hessian_vector_product(self.gradsH, self.params, q0)
        # First column of T.
        qs = [q0]
        Hqs = [Hq0]
        T = np.zeros((k + 1, k))
        T[0, 0] = group_product(qs[0], Hqs[0]).cpu().item()
        r = multi_add([Hqs[0], qs[0]], [1, -1 * T[0, 0]])  # r = Hq0 - T00*q0
        T[1, 0] = norm(r)  # T10 = |r|
        T[0, 1] = T[1, 0]  # T symmetric
        q1 = [ri / T[1, 0] for ri in r]  # q1 = r / |r|
        total_time_to_compute.append(time.time() - start_time)
        # Calculate Hq2
        self.model.zero_grad()
        if self.full_dataset:
            _, Hq1 = self.dataloader_hv_product(q1)
        else:
            Hq1 = hessian_vector_product(self.gradsH, self.params, q1)
        qs.append(q1)
        Hqs.append(Hq1)
        # Subsequent columns (1 .. k-1): standard three-term recurrence.
        for i in range(1, k):
            print(i)
            T[i, i] = group_product(qs[i], Hqs[i]).cpu().item()
            r = multi_add([Hqs[i], qs[i - 1], qs[i]],
                          [1, -1 * T[i - 1, i], -1 * T[i, i]])
            T[i + 1, i] = norm(r)
            if i != k - 1:
                T[i, i + 1] = T[i + 1, i]
            q = [ri / T[i + 1, i] for ri in r]
            total_time_to_compute.append(time.time() - start_time)
            self.model.zero_grad()
            if self.full_dataset:
                _, Hq = self.dataloader_hv_product(q)
            else:
                Hq = hessian_vector_product(self.gradsH, self.params, q)
            qs.append(q)
            Hqs.append(Hq)
        T_UH = T[0:k, 0:k]  # T_UH is the square tridiagonal block

        # Write data if applicable
        if self.record_data:
            with open(save_file, 'w') as f:
                f.write("Total Elapsed Time(s)\tEigenvalues\n")
                for i in range(k):
                    # NOTE(review): at i == 0 this is an empty matrix, so
                    # the first row carries no eigenvalues.
                    eigs_i = np.linalg.eigvalsh(T[0:i, 0:i])
                    s = ""
                    for e in eigs_i:
                        s += "\t" + str(e)
                    s = str(total_time_to_compute[i]) + s + "\n"
                    f.write(s)
        return np.linalg.eigvalsh(T_UH)
Example #10
0
    def density(self, iter=100, n_v=1, debug=False):
        """
        compute estimated eigenvalue density using stochastic lanczos algorithm (SLQ)
        iter: number of iterations used to compute trace
        n_v: number of SLQ runs
        debug: print per-iteration progress

        Returns (eigen_list_full, weight_list_full): per-run lists of Ritz
        values and their SLQ quadrature weights.
        """

        device = self.device
        eigen_list_full = []   # Ritz value estimates, one list per SLQ run
        weight_list_full = []  # matching quadrature weights, one list per run

        # Prepare to record data
        if self.record_data:
            now = datetime.datetime.now()
            timestamp = "_{:02d}{:02d}_{:02d}{:02d}{:02d}".format(
                now.day, now.month, now.hour, now.minute, now.second)
            save_file = self.data_save_dir + "ESD" + timestamp + ".txt"

        start_time = time.time()
        for k in range(n_v):
            v = [
                torch.randint_like(p, high=2, device=device)
                for p in self.params
            ]
            # generate Rademacher random variables (entries in {-1, +1})
            for v_i in v:
                v_i[v_i == 0] = -1
            v = normalization(v)

            # standard lanczos algorithm initlization
            v_list = [v]
            w_list = []      # NOTE(review): never appended or read in this variant
            alpha_list = []  # diagonal entries of the tridiagonal matrix T
            beta_list = []   # off-diagonal entries of T
            ############### Lanczos
            for i in range(iter):
                if debug:
                    print("Iteration {}".format(i))
                self.model.zero_grad()
                # placeholder; overwritten by the Hessian-vector product below
                w_prime = [
                    torch.zeros(p.size()).to(device) for p in self.params
                ]
                if i == 0:
                    if self.full_dataset:
                        _, w_prime = self.dataloader_hv_product(v)
                    else:
                        w_prime = hessian_vector_product(
                            self.gradsH, self.params, v)
                    alpha = group_product(w_prime, v)
                    alpha_list.append(alpha.cpu().item())
                    # w = H v - alpha * v  (first Lanczos residual)
                    w = group_add(w_prime, v, alpha=-alpha)
                    w_list.append(w)
                else:
                    beta = torch.sqrt(group_product(w, w))
                    beta_list.append(beta.cpu().item())
                    if beta_list[-1] != 0.:
                        # We should re-orth it
                        v = orthnormal(w, v_list)
                        v_list.append(v)
                    else:
                        # generate a new vector (Lanczos breakdown: w vanished)
                        w = [
                            torch.randn(p.size()).to(device)
                            for p in self.params
                        ]
                        v = orthnormal(w, v_list)
                        v_list.append(v)
                    if self.full_dataset:
                        _, w_prime = self.dataloader_hv_product(v)
                    else:
                        w_prime = hessian_vector_product(
                            self.gradsH, self.params, v)
                    alpha = group_product(w_prime, v)
                    alpha_list.append(alpha.cpu().item())
                    # three-term recurrence: w = H v - alpha*v - beta*v_prev
                    w_tmp = group_add(w_prime, v, alpha=-alpha)
                    w = group_add(w_tmp, v_list[-2], alpha=-beta)

            # Assemble the symmetric tridiagonal matrix T from alpha/beta.
            T = torch.zeros(iter, iter).to(device)
            for i in range(len(alpha_list)):
                T[i, i] = alpha_list[i]
                if i < len(alpha_list) - 1:
                    T[i + 1, i] = beta_list[i]
                    T[i, i + 1] = beta_list[i]
            # NOTE(review): torch.eig is deprecated and removed in newer
            # PyTorch releases; this code assumes an older torch version.
            a_, b_ = torch.eig(T, eigenvectors=True)

            eigen_list = a_[:, 0]       # real parts of the eigenvalues
            weight_list = b_[0, :]**2   # squared first components = SLQ weights
            eigen_list_full.append(list(eigen_list.cpu().numpy()))
            weight_list_full.append(list(weight_list.cpu().numpy()))
        # Write data if applicable
        stop_time = time.time()
        if self.record_data:
            with open(save_file, 'w') as f:
                f.write("Total Elapsed Time(s)\n")
                f.write("{}\n".format(stop_time - start_time))
        return eigen_list_full, weight_list_full
Example #11
0
    def eigenvalues(self, maxIter=100, tol=1e-3, top_n=1, debug=False):
        """
        Compute the top_n eigenvalues of the Hessian by power iteration
        with deflation, optionally recording timing data.

        maxIter: maximum iterations used to compute each single eigenvalue
        tol: relative tolerance between two consecutive eigenvalue
             estimates from power iteration
        top_n: number of leading eigenvalues to compute
        debug: print per-eigenvalue / per-iteration progress

        Returns (eigenvalues, eigenvectors).
        """
        assert top_n >= 1

        device = self.device

        eigenvalues = []
        eigenvectors = []

        computed_dim = 0

        # BUG FIX: these histories are appended to unconditionally in the
        # loop below, so they must exist even when record_data is False
        # (previously only defined inside the record_data branch -> NameError).
        total_time_to_compute = []
        iters_to_compute = []

        # Prepare to record data
        if self.record_data:
            now = datetime.datetime.now()
            timestamp = "_{:02d}{:02d}_{:02d}{:02d}{:02d}".format(
                now.day, now.month, now.hour, now.minute, now.second)
            save_file = self.data_save_dir + "TopEigen" + timestamp + ".txt"

        start_time = time.time()
        while computed_dim < top_n:
            if debug:
                print("Computing eigenvalue #{}".format(computed_dim + 1))
            eigenvalue = None
            v = [torch.randn(p.size()).to(device)
                 for p in self.params]  # generate random vector
            v = normalization(v)  # normalize the vector

            for i in range(maxIter):
                if debug:
                    print("   Iteration {}".format(i))
                # Deflate: stay orthogonal to already-found eigenvectors.
                v = orthnormal(v, eigenvectors)
                self.model.zero_grad()

                if self.full_dataset:
                    tmp_eigenvalue, Hv = self.dataloader_hv_product(v)
                else:
                    Hv = hessian_vector_product(self.gradsH, self.params, v)
                    tmp_eigenvalue = group_product(Hv, v).cpu().item()

                v = normalization(Hv)

                # `is None` rather than `== None` (PEP 8).
                if eigenvalue is None:
                    eigenvalue = tmp_eigenvalue
                elif abs(eigenvalue - tmp_eigenvalue) / (abs(eigenvalue) +
                                                         1e-6) < tol:
                    break
                else:
                    eigenvalue = tmp_eigenvalue
            # Record data
            total_time_to_compute.append(time.time() - start_time)
            iters_to_compute.append(i)
            eigenvalues.append(eigenvalue)
            eigenvectors.append(v)
            computed_dim += 1
        # Write data if applicable
        if self.record_data:
            with open(save_file, 'w') as f:
                f.write("Eigenvalue\tTotal Elapsed Time(s)\t#Iterations\n")
                for i in range(top_n):
                    f.write("{}\t{}\t{}\n".format(i + 1,
                                                  total_time_to_compute[i],
                                                  iters_to_compute[i]))
        return eigenvalues, eigenvectors