Example #1
    def __init__(self, n_agent, n_hidden=64, shuffle=True, **kwargs):

        # Load data
        self.X_train, self.Y_train, self.X_test, self.Y_test = MNIST().load()

        self.X_train = np.append(self.X_train, np.ones((self.X_train.shape[0], 1)), axis=1)
        self.X_test = np.append(self.X_test, np.ones((self.X_test.shape[0], 1)), axis=1)

        self.n_hidden = n_hidden  # Number of neurons in hidden layer
        self.m = int(self.X_train.shape[0] / n_agent)
        self.n_class = self.Y_train.shape[1]
        self.img_dim = self.X_train.shape[1]

        log.info('img_dim = %d', self.img_dim)
        log.info('n_class = %d', self.n_class)
        # Shuffle the training set
        if shuffle:
            idx = np.random.permutation(len(self.X_train))
            self.X_train, self.Y_train = self.X_train[idx], self.Y_train[idx]

        super().__init__(n_agent, self.m, (n_hidden + 1) * (self.img_dim + self.n_class), **kwargs)

        # Split training data into n agents
        self.X = self.split_data(self.X_train)
        self.Y = self.split_data(self.Y_train)
        self.Y_train_labels = self.Y_train.argmax(axis=1)
        self.Y_test_labels = self.Y_test.argmax(axis=1)

        # Internal buffers
        self._dw = np.zeros(self.dim)
        self._dw1, self._dw2 = self.unpack_w(self._dw)  # Reference to the internal buffer
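Both this example and Example #6 below rely on a split_data helper from the problem base class. A minimal standalone sketch of one possible implementation, assuming the first n_agent * m samples are reshaped into blocks of shape (n_agent, m, ...) (consistent with the einsum calls in Example #6); the repo's actual helper may differ:

import numpy as np

def split_data(data, n_agent, m):
    # Hypothetical sketch: keep the first n_agent * m samples and reshape
    # them into one block of m samples per agent.
    return data[:n_agent * m].reshape(n_agent, m, *data.shape[1:])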
Example #2
    def _init(self, result_queue=None):

        if xp.__name__ == 'cupy':
            self.cuda()

        if self.is_smooth:
            x_min = xp.linalg.solve(
                self.X_train.T.dot(self.X_train) +
                2 * self.m_total * self.r * xp.eye(self.dim),
                self.X_train.T.dot(self.Y_train))

        else:
            from nda.optimizers.utils import FISTA
            x_min, _ = FISTA(self.grad_h,
                             xp.random.randn(self.dim),
                             self.L,
                             self.r,
                             n_iters=100000)

        f_min = self.f(x_min)
        log.info(f'f_min = {f_min}')

        if xp.__name__ == 'cupy':
            f_min = f_min.item()
            x_min = x_min.get()

        if result_queue is not None:
            result_queue.put(x_min)
            result_queue.put(f_min)

        return x_min, f_min
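For reference, the smooth branch above is the closed-form minimizer of the ridge-regularized least-squares objective f(x) = ||X x - Y||^2 / (2 m_total) + r ||x||^2, whose gradient vanishes exactly when (X^T X + 2 m_total r I) x = X^T Y. A small self-contained numpy check of that identity (variable names here are illustrative, not the repo's):

import numpy as np

m_total, dim, r = 200, 5, 0.1
X = np.random.randn(m_total, dim)
y = np.random.randn(m_total)

# Closed-form minimizer, same expression as in the smooth branch above
x_min = np.linalg.solve(X.T.dot(X) + 2 * m_total * r * np.eye(dim), X.T.dot(y))

# Gradient of ||X x - y||^2 / (2 m_total) + r ||x||^2 should be ~0 at x_min
grad = X.T.dot(X.dot(x_min) - y) / m_total + 2 * r * x_min
assert np.linalg.norm(grad) < 1e-8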
Example #3
    def __init__(self, p, eta=0.1, **kwargs):
        super().__init__(p, is_distributed=False, **kwargs)
        self.eta = eta
        if not self.p.is_smooth:
            log.info('Nonsmooth problem, running sub-gradient descent instead')
            self.update = self.subgd_update
            self.name = 'SubGD'
Example #4
def multi_process_helper(device_id, task_id, opt, res_queue):
    start = time.time()
    log.info(f'{opt.get_name()} started')
    log.debug(f'task {task_id} started on device {device_id}')
    np.random.seed(task_id)
    random.seed(task_id)

    try:
        # Use the GPU if cupy is available; otherwise fall back to the CPU
        import cupy as cp
        cp.cuda.Device(device=device_id).use()
        cp.random.seed(task_id)
        opt.cuda()
    except ModuleNotFoundError:
        pass

    opt.optimize()
    columns, metrics = opt.get_metrics()
    name = opt.get_name()

    end = time.time()
    log.info('%s done, total %.2fs', name, end - start)
    log.debug(f'task {task_id} on device {device_id} exited')

    res_queue.put([task_id, name, pd.DataFrame(metrics, columns=columns)])
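A minimal usage sketch (an assumption, not taken from the repo): one worker process per optimizer, all writing to a shared result queue. Here opts is a placeholder list of configured optimizer objects and device_id a placeholder GPU id.

import multiprocessing as mp

device_id = 0  # placeholder GPU id
res_queue = mp.Queue()
processes = [
    mp.Process(target=multi_process_helper,
               args=(device_id, task_id, opt, res_queue))
    for task_id, opt in enumerate(opts)
]
for proc in processes:
    proc.start()
# Drain the queue before joining to avoid blocking on large DataFrames
results = [res_queue.get() for _ in processes]
for proc in processes:
    proc.join()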
Example #5
def multi_process_helper(opt):
    start = time.time()
    log.info('%s started', opt.get_name())
    opt.optimize()
    end = time.time()
    log.info('%s done, total %.2fs', opt.get_name(), end - start)
    return opt
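Since this variant takes a single argument and returns the optimizer, it maps naturally onto a process pool; a sketch under the assumption that the optimizer objects are picklable (opts is a placeholder list):

import multiprocessing as mp

with mp.Pool(processes=4) as pool:
    finished_opts = pool.map(multi_process_helper, opts)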
Example #6
    def __init__(self,
                 n_agent,
                 m,
                 dim,
                 noise_variance=0.1,
                 kappa=10,
                 **kwargs):

        super().__init__(n_agent, m, dim, **kwargs)

        self.noise_variance = noise_variance
        self.kappa = kappa

        # Generate X
        self.X_total, self.L, self.sigma, self.S = self.generate_x(
            self.m_total, self.dim, self.kappa)

        # Generate Y and the optimal solution
        self.x_0 = self.w_0 = np.random.rand(self.dim)
        self.Y_0_total = self.X_total.dot(self.w_0)
        self.Y_total = self.Y_0_total + np.sqrt(
            self.noise_variance) * np.random.randn(self.m_total)

        # Split data
        self.X = self.split_data(self.X_total)
        self.Y = self.split_data(self.Y_total)

        # Pre-calculate matrix products to accelerate gradient and function value evaluations
        self.H = self.X_total.T.dot(self.X_total) / self.m_total
        self.H_list = np.einsum('ikj,ikl->ijl', self.X, self.X) / self.m

        self.X_T_Y = self.X_total.T.dot(self.Y_total) / self.m_total
        self.X_T_Y_list = np.einsum('ikj,ik->ij', self.X, self.Y) / self.m

        if self.is_smooth:
            self.x_min = self.w_min = np.linalg.solve(
                self.X_total.T.dot(self.X_total) +
                2 * self.m_total * self.r * np.eye(self.dim),
                self.X_total.T.dot(self.Y_total))
        else:
            import sys
            sys.path.append("..")
            from optimizers.utils import FISTA
            self.x_min, _ = FISTA(self.grad_h,
                                  np.random.randn(self.dim),
                                  self.L,
                                  self.r,
                                  n_iters=100000)
            self.w_min = self.x_min

        self.f_min = self.f(self.x_min)

        log.info(
            'beta = %.4f',
            np.linalg.norm(self.H_list - self.H, ord=2, axis=(1, 2)).max())
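Because the m_total samples are split evenly into n_agent blocks of m samples, the global Gram matrix H is exactly the average of the per-agent matrices in H_list. A small self-contained numpy check of the einsum bookkeeping above, assuming the (n_agent, m, dim) layout sketched after Example #1:

import numpy as np

n_agent, m, dim = 4, 50, 3
X_total = np.random.randn(n_agent * m, dim)
X = X_total.reshape(n_agent, m, dim)          # per-agent blocks

H = X_total.T.dot(X_total) / (n_agent * m)
H_list = np.einsum('ikj,ikl->ijl', X, X) / m  # per-agent Gram matrices

assert np.allclose(H, H_list.mean(axis=0))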
Example #7
    def check_stopping_conditions(self):
        '''Check stopping conditions'''

        if self.x.ndim > 1:
            x = self.x.mean(axis=1)
        else:
            x = self.x

        grad_norm = norm(self.p.grad(x))
        if grad_norm < self.grad_eps:
            log.info('Gradient norm converged')
            return True
        elif grad_norm > 100:
            log.info('Gradient norm diverged')
            return True

        if self.p.x_min is not None:
            distance = norm(x - self.p.x_min) / norm(self.p.x_min)
            if distance < self.var_eps:
                log.info('Variable converged')
                return True

            if distance > self.p.dim:
                log.info('Variable diverged')
                return True

        return False
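A rough sketch of how an optimization loop might consult this check (illustrative only; apart from check_stopping_conditions, the attribute and method names are assumptions):

    def optimize(self):
        for t in range(self.n_iters):
            self.update()                         # one optimization step (assumed name)
            if self.check_stopping_conditions():  # stop early on convergence or divergence
                break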
Example #8
    def __init__(self, noise_variance=0.1, kappa=10, **kwargs):

        self.noise_variance = noise_variance
        self.kappa = kappa

        super().__init__(**kwargs)

        # Pre-calculate matrix products to accelerate gradient and function value evaluations
        self.H = self.X_train.T.dot(self.X_train) / self.m_total
        self.X_T_Y = self.X_train.T.dot(self.Y_train) / self.m_total

        if xp.__name__ == 'cupy':
            log.info('Initializing using GPU')
            q = mp.Queue(2)
            pp = mp.Process(target=self._init, args=(q, ))
            pp.start()
            pp.join()
            self.x_min = self.w_min = q.get()
            self.f_min = q.get()
        else:
            log.info('Initializing using CPU')
            self.x_min, self.f_min = self._init()

        # Pre-calculate per-agent matrix products to accelerate gradient and function value evaluations
        # (done after computing the minimum to reduce memory copies)
        self.H_list = np.einsum('ikj,ikl->ijl', self.X, self.X) / self.m
        self.X_T_Y_list = np.einsum('ikj,ik->ij', self.X, self.Y) / self.m
        log.info(
            'beta = %.4f',
            np.linalg.norm(self.H_list - self.H, ord=2, axis=(1, 2)).max())
        log.info('Initialization done')
Example #9
    def grad_check(self):
        '''Check whether the full gradient matches the gradient computed by central finite differences at a random point.'''
        w = np.random.randn(self.dim)
        delta = np.zeros(self.dim)
        grad = np.zeros(self.dim)
        eps = 1e-4

        for i in range(self.dim):
            delta[i] = eps
            grad[i] = (self.f(w + delta) - self.f(w - delta)) / 2 / eps
            delta[i] = 0

        if np.linalg.norm(grad - self.grad(w)) > eps:
            log.warn('Gradient implementation check failed!')
            return False
        else:
            log.info('Gradient implementation check succeeded!')
            return True
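The check uses a central finite difference, (f(w + eps * e_i) - f(w - eps * e_i)) / (2 * eps), for each coordinate i. A standalone illustration of the same idea on a simple quadratic (a hypothetical example, not from the repo):

import numpy as np

dim, eps = 5, 1e-4
A = np.random.randn(dim, dim)
A = A.T.dot(A) + np.eye(dim)      # symmetric positive definite

f = lambda w: 0.5 * w.dot(A).dot(w)
grad = lambda w: A.dot(w)         # analytic gradient

w = np.random.randn(dim)
num_grad = np.zeros(dim)
for i in range(dim):
    delta = np.zeros(dim)
    delta[i] = eps
    num_grad[i] = (f(w + delta) - f(w - delta)) / (2 * eps)

assert np.linalg.norm(num_grad - grad(w)) < eps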
Example #10
    def __init__(self, n_hidden=64, dataset='mnist', **kwargs):

        super().__init__(dataset=dataset, **kwargs)

        self.n_hidden = n_hidden  # Number of neurons in hidden layer
        self.n_class = self.Y_train.shape[1]
        self.img_dim = self.X_train.shape[1]
        self.dim = (n_hidden + 1) * (self.img_dim + self.n_class)

        self.Y_train_labels = self.Y_train.argmax(axis=1)
        self.Y_test_labels = self.Y_test.argmax(axis=1)

        # Internal buffers
        self._dw = np.zeros(self.dim)
        self._dw1, self._dw2 = self.unpack_w(
            self._dw)  # Reference to the internal buffer

        log.info('Initialization done')
Example #11
    def _init(self, result_queue=None):

        if xp.__name__ == 'cupy':
            self.cuda()

        log.info('Computing norm')
        norm = xp.linalg.norm(self.X_train, 2) / (
            2 * xp.sqrt(self.m_total))  # Upper bound of the hessian
        self.X_train /= norm
        self.X /= norm

        if self.kappa is not None:
            log.info('Computing min')
            x_min, count = NAG(self.grad,
                               xp.random.randn(self.dim),
                               self.L,
                               self.sigma,
                               n_iters=5000,
                               eps=1e-10)
            log.info(f'NAG ran for {count} iterations')
            f_min = self.f(x_min)
            log.info(f'f_min = {f_min}')
            log.info(f'grad_f(x_min) = {xp.linalg.norm(self.grad(x_min))}')

        if xp.__name__ == 'cupy':
            norm = norm.item()
            if self.kappa is not None:
                x_min = x_min.get()
                f_min = f_min.item()
            else:
                x_min = f_min = None

        if result_queue is not None:
            result_queue.put(norm)
            if self.kappa is not None:
                result_queue.put(x_min)
                result_queue.put(f_min)

        if self.kappa is not None:
            return norm, x_min, f_min

        return norm, None, None
Example #12
    def load(self):
        if not os.path.exists(self.cache_path):
            log.info('Downloading %s dataset' % self.name)
            os.system('mkdir -p %s' % self.data_dir)
            self.download()
            data = self.load_raw()
            np.savez_compressed(self.cache_path,
                                X_train=data[0],
                                Y_train=data[1],
                                X_test=data[2],
                                Y_test=data[3])
            return data

        else:
            log.info('Loading %s dataset from cached file' % self.name)
            data = np.load(self.cache_path, allow_pickle=True)
            return [
                data[key]
                for key in ['X_train', 'Y_train', 'X_test', 'Y_test']
            ]
Example #13
    def load(self):
        if not os.path.exists(self.cache_path):
            log.info('Downloading %s dataset' % self.name)
            os.system('mkdir -p %s' % self.data_dir)
            self.download()
            self.load_raw()
            np.savez_compressed(self.cache_path,
                                X_train=self.X_train,
                                Y_train=self.Y_train,
                                X_test=self.X_test,
                                Y_test=self.Y_test)

        else:
            log.info('Loading %s dataset from cached file' % self.name)
            data = np.load(self.cache_path, allow_pickle=True)
            self.X_train = data['X_train']
            self.Y_train = data['Y_train']
            self.X_test = data['X_test']
            self.Y_test = data['Y_test']

        if self.normalize:
            self.normalize_data()

        return self.X_train, self.Y_train, self.X_test, self.Y_test
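Usage is the same in both variants; for example, with the MNIST wrapper already used in Example #1 (assuming the class is importable from the repo's dataset module):

X_train, Y_train, X_test, Y_test = MNIST().load()
log.info('train: %s, test: %s', X_train.shape, X_test.shape)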
Example #14
    def __init__(self,
                 kappa=None,
                 noise_ratio=None,
                 LAMBDA=0,
                 alpha=0,
                 **kwargs):

        self.noise_ratio = noise_ratio
        self.kappa = kappa
        self.alpha = alpha
        self.LAMBDA = LAMBDA

        super().__init__(**kwargs)

        if alpha == 0:
            if kappa == 1:
                self.LAMBDA = 100
            elif kappa is not None:
                self.LAMBDA = 1 / (self.kappa - 1)
            self.L = 1 + self.LAMBDA
            self.sigma = self.LAMBDA if self.LAMBDA != 0 else None
        else:
            self.L = 1 + self.LAMBDA + 6 * self.alpha
            self.sigma = self.LAMBDA + 2 * self.alpha

        if xp.__name__ == 'cupy':
            log.info('Initializing using GPU')
            q = mp.Queue(3)
            pp = mp.Process(target=self._init, args=(q, ))
            pp.start()
            pp.join()
            norm = q.get()
            if self.kappa is not None:
                self.x_min = self.w_min = q.get()
                self.f_min = q.get()
        else:
            log.info('Initializing using CPU')
            norm, self.x_min, self.f_min = self._init()

        self.X_train /= norm
        self.X_test /= norm

        log.info('Initialization done')
Example #15
    # Experiment 1: Gisette classification
    p = LogisticRegression(n_agent, graph_type='er', graph_params=0.3, dataset='gisette', alpha=0.001)
    dim = p.dim

    os.system('mkdir -p data figs')
    if os.path.exists('data/gisette_initialization.npz'):
        x_0 = np.load('data/gisette_initialization.npz').get('x_0')
    else:
        x_0 = np.random.rand(dim, n_agent)
        np.savez('data/gisette_initialization.npz', x_0=x_0)
    x_0_mean = x_0.mean(axis=1)

    # Experiment 1.1: er topology
    W, alpha = generate_mixing_matrix(p)
    log.info('alpha = %.4f', alpha)

    exps = [
        DSGD(p, n_iters=20000, eta=1, x_0=x_0, W=W, diminishing_step_size=True),
        DESTRESS(p, n_iters=300, n_inner_iters=10, eta=1, K_in=2, K_out=2, batch_size=10, x_0=x_0, W=W),
        GT_SARAH(p, n_iters=300, n_inner_iters=10, batch_size=10, eta=0.1, x_0=x_0, W=W),
    ]

    begin = time.time()
    exps = run_exp(exps, name='gisette-er', n_process=1, plot=False, save=True)
    end = time.time()
    log.info('Total %.2fs', end - begin)

    plot_gisette_exp(exps, 'er', p.m_total)

    # Experiment 1.2: grid topology
Example #16
    dim = 40

    kappa = 10
    mu = 5e-10
    n_iters = 10

    p = LinearRegression(n_agent=n_agent,
                         m=m,
                         dim=dim,
                         noise_variance=1,
                         kappa=kappa,
                         graph_type='er',
                         graph_params=0.3)
    W, alpha = generate_mixing_matrix(p)

    log.info('m = %d, n = %d, alpha = %.4f' % (m, n_agent, alpha))

    x_0 = np.random.rand(dim, n_agent)
    x_0_mean = x_0.mean(axis=1)

    eta_2 = 2 / (p.L + p.sigma)
    eta_1 = 1 / p.L

    n_inner_iters = 100
    n_sarah_iters = n_iters * 20
    n_dgd_iters = n_iters * 20
    batch_size = int(m / 100)
    n_dsgd_iters = int(n_iters * m / batch_size)

    centralized = [
        GD(p, n_iters=n_iters, eta=eta_2, x_0=x_0_mean),
Example #17
if __name__ == '__main__':

    import matplotlib.pyplot as plt

    n = 10
    m = 1000
    dim = 10
    noise_variance = 0.01

    p = LinearRegression(n,
                         m,
                         dim,
                         noise_variance=noise_variance,
                         n_edges=4 * n,
                         balanced=False)
    log.info(p.m)
    p.grad_check()
    p.distributed_check()

    # p = LinearRegression(n, m, dim, noise_variance=noise_variance, n_edges=4*n)
    p.plot_graph()

    log.info('w_min = ' + str(p.w_min))
    log.info('f(w_min) = ' + str(p.f(p.w_min)))
    log.info('f_0(w_min) = ' + str(p.f(p.w_min, 0)))
    log.info('|| g(w_min) || = ' + str(np.linalg.norm(p.grad(p.w_min))))
    log.info('|| g_0(w_min) || = ' + str(np.linalg.norm(p.grad(p.w_min, 0))))

    plt.show()
Example #18
    def distributed_check(self):
        '''Check that the distributed function and gradient implementations are correct.'''
        def _check_1d_gradient():

            w = xp.random.randn(self.dim)
            g = self.grad(w)
            g_i = g_ij = 0
            res = True

            for i in range(self.n_agent):
                _tmp_g_i = self.grad(w, i)
                _tmp_g_ij = 0
                for j in range(self.m):
                    _tmp_g_ij += self.grad(w, i, j)

                if xp.linalg.norm(_tmp_g_i - _tmp_g_ij / self.m) > 1e-5:
                    log.warn(
                        'Distributed gradient check failed! Difference between local gradient at agent %d and average of all local sample gradients is %.4f'
                        % (i, xp.linalg.norm(_tmp_g_i - _tmp_g_ij / self.m)))
                    res = False

                g_i += _tmp_g_i
                g_ij += _tmp_g_ij

            g_i /= self.n_agent
            g_ij /= self.m_total

            if xp.linalg.norm(g - g_i) > 1e-5:
                log.warn(
                    'Distributed gradient check failed! Difference between global gradient and average of local gradients is %.4f',
                    xp.linalg.norm(g - g_i))
                res = False

            if xp.linalg.norm(g - g_ij) > 1e-5:
                log.warn(
                    'Distributed gradient check failed! Difference between global gradient and average of all sample gradients is %.4f'
                    % xp.linalg.norm(g - g_ij))
                res = False

            return res

        def _check_2d_gradient():

            res = True
            w_2d = xp.random.randn(self.dim, self.n_agent)

            g_1d = 0
            for i in range(self.n_agent):
                g_1d += self.grad(w_2d[:, i], i=i)

            g_1d /= self.n_agent
            g_2d = self.grad(w_2d).mean(axis=1)

            if xp.linalg.norm(g_1d - g_2d) > 1e-5:
                log.warn(
                    'Distributed gradient check failed! Difference between global gradient and average of distributed gradients is %.4f'
                    % xp.linalg.norm(g_1d - g_2d))
                res = False

            g_2d_sample = self.grad(w_2d,
                                    j=xp.arange(self.m).reshape(-1, 1).repeat(
                                        self.n_agent, axis=1).T).mean(axis=1)

            if xp.linalg.norm(g_1d - g_2d_sample) > 1e-5:
                log.warn(
                    'Distributed gradient check failed! Difference between global gradient and average of all sample gradients is %.4f'
                    % xp.linalg.norm(g_1d - g_2d_sample))
                res = False

            samples = xp.random.randint(0, self.m, (self.n_agent, 10))
            g_2d_stochastic = self.grad(w_2d, j=samples)
            for i in range(self.n_agent):
                g_1d_stochastic = self.grad(w_2d[:, i], i=i, j=samples[i])
                if xp.linalg.norm(g_1d_stochastic -
                                  g_2d_stochastic[:, i]) > 1e-5:
                    log.warn(
                        'Distributed gradient check failed! Difference between distributed stochastic gradient at agent %d and average of sample gradients is %.4f'
                        % (i,
                           xp.linalg.norm(g_1d_stochastic -
                                          g_2d_stochastic[:, i])))
                    res = False

            return res

        def _check_function_value():
            w = xp.random.randn(self.dim)
            f = self.f(w)
            f_i = f_ij = 0
            res = True

            for i in range(self.n_agent):
                _tmp_f_i = self.f(w, i)
                _tmp_f_ij = 0
                for j in range(self.m):
                    _tmp_f_ij += self.f(w, i, j)

                if xp.abs(_tmp_f_i - _tmp_f_ij / self.m) > 1e-10:
                    log.warn(
                        'Distributed function value check failed! Difference between local function value at agent %d and average of its sample function values is %.4f'
                        % (i, xp.abs(_tmp_f_i - _tmp_f_ij / self.m)))
                    res = False

                f_i += _tmp_f_i
                f_ij += _tmp_f_ij

            f_i /= self.n_agent
            f_ij /= self.m_total

            if xp.abs(f - f_i) > 1e-10:
                log.warn(
                    'Distributed function value check failed! Difference between the global function value and average of local function values is %.4f'
                    % xp.abs(f - f_i))
                res = False

            if xp.abs(f - f_ij) > 1e-10:
                log.warn(
                    'Distributed function value check failed! Difference between the global function value and average of all sample function values is %.4f'
                    % xp.abs(f - f_ij))
                res = False

            return res

        res = (_check_function_value() & _check_1d_gradient()
               & _check_2d_gradient())
        if res:
            log.info('Distributed check succeeded!')
            return True
        else:
            return False
Example #19
    n_agent = 20
    n_iters = 20

    p = LogisticRegression(n_agent,
                           graph_type='er',
                           graph_params=0.3,
                           dataset='gisette',
                           alpha=0.001)

    x_0 = np.random.rand(p.dim, n_agent)
    x_0_mean = x_0.mean(axis=1)

    batch_size = int(np.sqrt(p.m))
    n_inner_iters = 10

    W, alpha = generate_mixing_matrix(p)
    log.info('alpha = %.4f', alpha)

    exps = [
        GD(p, n_iters=n_iters, eta=100, x_0=x_0, W=W),
    ]

    exps = run_exp(exps,
                   max_iter=n_iters,
                   name='gisette',
                   n_process=2,
                   plot=True,
                   save=False)
    plt.show()