def __init__(self, n_agent, n_hidden=64, shuffle=True, **kwargs):
    # Load data
    self.X_train, self.Y_train, self.X_test, self.Y_test = MNIST().load()
    self.X_train = np.append(self.X_train, np.ones((self.X_train.shape[0], 1)), axis=1)
    self.X_test = np.append(self.X_test, np.ones((self.X_test.shape[0], 1)), axis=1)

    self.n_hidden = n_hidden  # Number of neurons in hidden layer
    self.m = int(self.X_train.shape[0] / n_agent)
    self.n_class = self.Y_train.shape[1]
    self.img_dim = self.X_train.shape[1]
    log.info(self.img_dim)
    log.info(self.n_class)

    # Shuffle
    if shuffle is True:
        idx = np.random.permutation(len(self.X_train))
        self.X_train, self.Y_train = self.X_train[idx], self.Y_train[idx]

    super().__init__(n_agent, self.m, (n_hidden + 1) * (self.img_dim + self.n_class), **kwargs)

    # Split training data into n_agent parts
    self.X = self.split_data(self.X_train)
    self.Y = self.split_data(self.Y_train)

    self.Y_train_labels = self.Y_train.argmax(axis=1)
    self.Y_test_labels = self.Y_test.argmax(axis=1)

    # Internal buffers
    self._dw = np.zeros(self.dim)
    self._dw1, self._dw2 = self.unpack_w(self._dw)  # References into the internal buffer
def _init(self, result_queue=None):
    if xp.__name__ == 'cupy':
        self.cuda()

    if self.is_smooth is True:
        x_min = xp.linalg.solve(
            self.X_train.T.dot(self.X_train) + 2 * self.m_total * self.r * xp.eye(self.dim),
            self.X_train.T.dot(self.Y_train))
    else:
        from nda.optimizers.utils import FISTA
        x_min, _ = FISTA(self.grad_h, xp.random.randn(self.dim), self.L, self.r, n_iters=100000)

    f_min = self.f(x_min)
    log.info(f'f_min = {f_min}')

    if xp.__name__ == 'cupy':
        f_min = f_min.item()
        x_min = x_min.get()

    if result_queue is not None:
        result_queue.put(x_min)
        result_queue.put(f_min)

    return x_min, f_min
def __init__(self, p, eta=0.1, **kwargs):
    super().__init__(p, is_distributed=False, **kwargs)
    self.eta = eta
    if self.p.is_smooth is False:
        log.info('Nonsmooth problem, running sub-gradient descent instead')
        self.update = self.subgd_update
        self.name = 'SubGD'
def multi_process_helper(device_id, task_id, opt, res_queue):
    start = time.time()
    log.info(f'{opt.get_name()} started')
    log.debug(f'task {task_id} started on device {device_id}')
    np.random.seed(task_id)
    random.seed(task_id)

    try:
        import cupy as cp
        cp.cuda.Device(device=device_id).use()
        cp.random.seed(task_id)
        opt.cuda()
        opt.optimize()
        columns, metrics = opt.get_metrics()
        name = opt.get_name()
    except ModuleNotFoundError:
        # cupy is not installed; fall back to running on the CPU
        opt.optimize()
        columns, metrics = opt.get_metrics()
        name = opt.get_name()

    end = time.time()
    log.info('%s done, total %.2fs', name, end - start)
    log.debug(f'task {task_id} on device {device_id} exited')
    res_queue.put([task_id, name, pd.DataFrame(metrics, columns=columns)])
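# A minimal, self-contained sketch (not library code) of the dispatch pattern that
# multi_process_helper is written for: each worker runs one task and then puts
# [task_id, name, DataFrame] on a shared queue that the parent process drains.
# The worker body below is a stand-in; the real helper calls opt.optimize().
import multiprocessing as mp
import pandas as pd

def _toy_worker(task_id, res_queue):
    # Stand-in for multi_process_helper(device_id, task_id, opt, res_queue)
    metrics = pd.DataFrame({'t': [0, 1, 2], 'f': [1.0, 0.5, 0.25]})
    res_queue.put([task_id, f'task-{task_id}', metrics])

if __name__ == '__main__':
    queue = mp.Queue()
    procs = [mp.Process(target=_toy_worker, args=(i, queue)) for i in range(3)]
    for proc in procs:
        proc.start()
    results = [queue.get() for _ in procs]  # drain the queue before joining to avoid blocking
    for proc in procs:
        proc.join()
    print(sorted(r[1] for r in results))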
def multi_process_helper(opt):
    start = time.time()
    log.info('%s started', opt.get_name())
    opt.optimize()
    end = time.time()
    log.info('%s done, total %.2fs', opt.get_name(), end - start)
    return opt
def __init__(self, n_agent, m, dim, noise_variance=0.1, kappa=10, **kwargs):
    super().__init__(n_agent, m, dim, **kwargs)

    self.noise_variance = noise_variance
    self.kappa = kappa

    # Generate X
    self.X_total, self.L, self.sigma, self.S = self.generate_x(
        self.m_total, self.dim, self.kappa)

    # Generate Y and the optimal solution
    self.x_0 = self.w_0 = np.random.rand(self.dim)
    self.Y_0_total = self.X_total.dot(self.w_0)
    self.Y_total = self.Y_0_total + np.sqrt(
        self.noise_variance) * np.random.randn(self.m_total)

    # Split data
    self.X = self.split_data(self.X_total)
    self.Y = self.split_data(self.Y_total)

    # Pre-calculate matrix products to accelerate gradient and function value evaluations
    self.H = self.X_total.T.dot(self.X_total) / self.m_total
    self.H_list = np.einsum('ikj,ikl->ijl', self.X, self.X) / self.m
    self.X_T_Y = self.X_total.T.dot(self.Y_total) / self.m_total
    self.X_T_Y_list = np.einsum('ikj,ik->ij', self.X, self.Y) / self.m

    if self.is_smooth is True:
        self.x_min = self.w_min = np.linalg.solve(
            self.X_total.T.dot(self.X_total) + 2 * self.m_total * self.r * np.eye(self.dim),
            self.X_total.T.dot(self.Y_total))
    else:
        import sys
        sys.path.append("..")
        from optimizers.utils import FISTA
        self.x_min, _ = FISTA(self.grad_h, np.random.randn(self.dim), self.L, self.r, n_iters=100000)
        self.w_min = self.x_min

    self.f_min = self.f(self.x_min)
    log.info(
        'beta = %.4f',
        np.linalg.norm(self.H_list - self.H, ord=2, axis=(1, 2)).max())
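# Self-contained sketch (not library code) verifying the closed-form minimizer used in the
# smooth branch above: for the ridge-regularized least squares
# f(w) = ||Xw - Y||^2 / (2m) + r * ||w||^2, setting the gradient
# X^T (Xw - Y) / m + 2 r w to zero gives (X^T X + 2 m r I) w = X^T Y.
import numpy as np

m, dim, r = 200, 10, 0.01
X = np.random.randn(m, dim)
Y = np.random.randn(m)
w_min = np.linalg.solve(X.T.dot(X) + 2 * m * r * np.eye(dim), X.T.dot(Y))
grad_at_min = X.T.dot(X.dot(w_min) - Y) / m + 2 * r * w_min
print(np.linalg.norm(grad_at_min))  # ~1e-15: w_min is a stationary point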
def check_stopping_conditions(self):
    '''Check stopping conditions.'''
    if self.x.ndim > 1:
        x = self.x.mean(axis=1)
    else:
        x = self.x

    grad_norm = norm(self.p.grad(x))
    if grad_norm < self.grad_eps:
        log.info('Gradient norm converged')
        return True
    elif grad_norm > 100:
        log.info('Gradient norm diverged')
        return True

    if self.p.x_min is not None:
        distance = norm(x - self.p.x_min) / norm(self.p.x_min)
        if distance < self.var_eps:
            log.info('Variable converged')
            return True
        if distance > self.p.dim:
            log.info('Variable diverged')
            return True

    return False
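# Self-contained sketch (not library code) of the two convergence tests above, gradient norm
# and relative distance to a known minimizer, applied to plain gradient descent on a toy
# quadratic. The thresholds grad_eps / var_eps stand in for the optimizer attributes.
import numpy as np

A = np.diag([1.0, 10.0])
x_min = np.array([1.0, -2.0])
grad = lambda x: A.dot(x - x_min)
x, grad_eps, var_eps = np.zeros(2), 1e-8, 1e-8
for t in range(10000):
    x -= 0.1 * grad(x)
    if np.linalg.norm(grad(x)) < grad_eps:
        print(f'gradient norm converged at iteration {t}')
        break
    if np.linalg.norm(x - x_min) / np.linalg.norm(x_min) < var_eps:
        print(f'variable converged at iteration {t}')
        break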
def __init__(self, noise_variance=0.1, kappa=10, **kwargs):
    self.noise_variance = noise_variance
    self.kappa = kappa
    super().__init__(**kwargs)

    # Pre-calculate matrix products to accelerate gradient and function value evaluations
    self.H = self.X_train.T.dot(self.X_train) / self.m_total
    self.X_T_Y = self.X_train.T.dot(self.Y_train) / self.m_total

    if xp.__name__ == 'cupy':
        log.info('Initializing using GPU')
        q = mp.Queue(2)
        pp = mp.Process(target=self._init, args=(q, ))
        pp.start()
        pp.join()
        self.x_min = self.w_min = q.get()
        self.f_min = q.get()
    else:
        log.info('Initializing using CPU')
        self.x_min, self.f_min = self._init()

    # Pre-calculate the per-agent matrix products after computing the minimum to reduce memory copies
    self.H_list = np.einsum('ikj,ikl->ijl', self.X, self.X) / self.m
    self.X_T_Y_list = np.einsum('ikj,ik->ij', self.X, self.Y) / self.m

    log.info(
        'beta = %.4f',
        np.linalg.norm(self.H_list - self.H, ord=2, axis=(1, 2)).max())
    log.info('Initialization done')
def grad_check(self):
    '''Check that the full gradient matches the gradient computed by central finite differences at a random point.'''
    w = np.random.randn(self.dim)
    delta = np.zeros(self.dim)
    grad = np.zeros(self.dim)
    eps = 1e-4

    for i in range(self.dim):
        delta[i] = eps
        grad[i] = (self.f(w + delta) - self.f(w - delta)) / 2 / eps
        delta[i] = 0

    if np.linalg.norm(grad - self.grad(w)) > eps:
        log.warn('Gradient implementation check failed!')
        return False
    else:
        log.info('Gradient implementation check succeeded!')
        return True
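# Self-contained sketch (not library code) of the same central finite-difference test on a
# function whose gradient is known analytically, f(w) = ||w||^2 / 2 with grad f(w) = w.
import numpy as np

def fd_grad(f, w, eps=1e-4):
    grad = np.zeros_like(w)
    delta = np.zeros_like(w)
    for i in range(len(w)):
        delta[i] = eps
        grad[i] = (f(w + delta) - f(w - delta)) / (2 * eps)
        delta[i] = 0
    return grad

w = np.random.randn(5)
print(np.linalg.norm(fd_grad(lambda v: 0.5 * v.dot(v), w) - w))  # ~1e-12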
def __init__(self, n_hidden=64, dataset='mnist', **kwargs):
    super().__init__(dataset=dataset, **kwargs)

    self.n_hidden = n_hidden  # Number of neurons in hidden layer
    self.n_class = self.Y_train.shape[1]
    self.img_dim = self.X_train.shape[1]
    self.dim = (n_hidden + 1) * (self.img_dim + self.n_class)

    self.Y_train_labels = self.Y_train.argmax(axis=1)
    self.Y_test_labels = self.Y_test.argmax(axis=1)

    # Internal buffers
    self._dw = np.zeros(self.dim)
    self._dw1, self._dw2 = self.unpack_w(self._dw)  # References into the internal buffer

    log.info('Initialization done')
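# Self-contained sketch (not library code) of the buffer trick used above: reshaped slices of
# one flat array share its memory, so gradients written into per-layer views such as _dw1 /
# _dw2 land directly in the flat vector _dw with no extra copies. The shapes here are arbitrary.
import numpy as np

dw = np.zeros(6)
dw1 = dw[:4].reshape(2, 2)  # view, not a copy
dw2 = dw[4:].reshape(2, 1)  # view, not a copy
dw1 += 1.0
dw2 += 2.0
print(dw)  # [1. 1. 1. 1. 2. 2.] -- updates through the views are visible in the flat buffer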
def _init(self, result_queue=None):
    if xp.__name__ == 'cupy':
        self.cuda()

    log.info('Computing norm')
    norm = xp.linalg.norm(self.X_train, 2) / (
        2 * xp.sqrt(self.m_total))  # Upper bound of the Hessian
    self.X_train /= norm
    self.X /= norm

    if self.kappa is not None:
        log.info('Computing min')
        x_min, count = NAG(self.grad, xp.random.randn(self.dim), self.L, self.sigma, n_iters=5000, eps=1e-10)
        log.info(f'NAG ran for {count} iterations')
        f_min = self.f(x_min)
        log.info(f'f_min = {f_min}')
        log.info(f'grad_f(x_min) = {xp.linalg.norm(self.grad(x_min))}')

    if xp.__name__ == 'cupy':
        norm = norm.item()
        if self.kappa is not None:
            x_min = x_min.get()
            f_min = f_min.item()
        else:
            x_min = f_min = None

    if result_queue is not None:
        result_queue.put(norm)
        if self.kappa is not None:
            result_queue.put(x_min)
            result_queue.put(f_min)

    if self.kappa is not None:
        return norm, x_min, f_min

    return norm, None, None
def load(self):
    if not os.path.exists(self.cache_path):
        log.info('Downloading %s dataset' % self.name)
        os.system('mkdir -p %s' % self.data_dir)
        self.download()
        data = self.load_raw()
        np.savez_compressed(self.cache_path, X_train=data[0], Y_train=data[1], X_test=data[2], Y_test=data[3])
        return data
    else:
        log.info('Loading %s dataset from cached file' % self.name)
        data = np.load(self.cache_path, allow_pickle=True)
        return [
            data[key] for key in ['X_train', 'Y_train', 'X_test', 'Y_test']
        ]
def load(self):
    if not os.path.exists(self.cache_path):
        log.info('Downloading %s dataset' % self.name)
        os.system('mkdir -p %s' % self.data_dir)
        self.download()
        self.load_raw()
        np.savez_compressed(self.cache_path, X_train=self.X_train, Y_train=self.Y_train, X_test=self.X_test, Y_test=self.Y_test)
    else:
        log.info('Loading %s dataset from cached file' % self.name)
        data = np.load(self.cache_path, allow_pickle=True)
        self.X_train = data['X_train']
        self.Y_train = data['Y_train']
        self.X_test = data['X_test']
        self.Y_test = data['Y_test']

    if self.normalize:
        self.normalize_data()

    return self.X_train, self.Y_train, self.X_test, self.Y_test
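# Self-contained sketch (not library code) of the caching pattern used by load(): expensive
# preprocessing runs once, the arrays are stored with np.savez_compressed, and later calls
# read the .npz archive instead. The file name below is arbitrary.
import os
import numpy as np

cache_path = 'toy_cache.npz'
if not os.path.exists(cache_path):
    X, Y = np.random.randn(100, 5), np.random.randn(100)
    np.savez_compressed(cache_path, X_train=X, Y_train=Y)
else:
    data = np.load(cache_path, allow_pickle=True)
    X, Y = data['X_train'], data['Y_train']
print(X.shape, Y.shape)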
def __init__(self, kappa=None, noise_ratio=None, LAMBDA=0, alpha=0, **kwargs):
    self.noise_ratio = noise_ratio
    self.kappa = kappa
    self.alpha = alpha
    self.LAMBDA = LAMBDA
    super().__init__(**kwargs)

    if alpha == 0:
        if kappa == 1:
            self.LAMBDA = 100
        elif kappa is not None:
            self.LAMBDA = 1 / (self.kappa - 1)
        self.L = 1 + self.LAMBDA
        self.sigma = self.LAMBDA if self.LAMBDA != 0 else None
    else:
        self.L = 1 + self.LAMBDA + 6 * self.alpha
        self.sigma = self.LAMBDA + 2 * self.alpha

    if xp.__name__ == 'cupy':
        log.info('Initializing using GPU')
        q = mp.Queue(3)
        pp = mp.Process(target=self._init, args=(q, ))
        pp.start()
        pp.join()
        norm = q.get()
        if self.kappa is not None:
            self.x_min = self.w_min = q.get()
            self.f_min = q.get()
    else:
        log.info('Initializing using CPU')
        norm, self.x_min, self.f_min = self._init()

    self.X_train /= norm
    self.X_test /= norm

    log.info('Initialization done')
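# Note on the condition-number parametrization above (a reading of the code, not library
# documentation): after _init rescales the feature matrix so that the logistic loss term is
# 1-smooth, the regularized objective has L = 1 + LAMBDA and sigma = LAMBDA, so
# kappa = L / sigma = 1 + 1 / LAMBDA; solving for LAMBDA gives the LAMBDA = 1 / (kappa - 1)
# used in the smooth (alpha == 0) branch.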
# Experiment 1: Gisette classification
p = LogisticRegression(n_agent, graph_type='er', graph_params=0.3, dataset='gisette', alpha=0.001)
dim = p.dim

os.system('mkdir data figs')
if os.path.exists('data/gisette_initialization.npz'):
    x_0 = np.load('data/gisette_initialization.npz').get('x_0')
else:
    x_0 = np.random.rand(dim, n_agent)
    np.savez('data/gisette_initialization.npz', x_0=x_0)
x_0_mean = x_0.mean(axis=1)

# Experiment 1.1: er topology
W, alpha = generate_mixing_matrix(p)
log.info('alpha = %.4f', alpha)

exps = [
    DSGD(p, n_iters=20000, eta=1, x_0=x_0, W=W, diminishing_step_size=True),
    DESTRESS(p, n_iters=300, n_inner_iters=10, eta=1, K_in=2, K_out=2, batch_size=10, x_0=x_0, W=W),
    GT_SARAH(p, n_iters=300, n_inner_iters=10, batch_size=10, eta=0.1, x_0=x_0, W=W),
]

begin = time.time()
exps = run_exp(exps, name='gisette-er', n_process=1, plot=False, save=True)
end = time.time()
log.info('Total %.2fs', end - begin)

plot_gisette_exp(exps, 'er', p.m_total)

# Experiment 1.2: grid topology
dim = 40
kappa = 10
mu = 5e-10
n_iters = 10

p = LinearRegression(n_agent=n_agent, m=m, dim=dim, noise_variance=1, kappa=kappa, graph_type='er', graph_params=0.3)
W, alpha = generate_mixing_matrix(p)
log.info('m = %d, n = %d, alpha = %.4f' % (m, n_agent, alpha))

x_0 = np.random.rand(dim, n_agent)
x_0_mean = x_0.mean(axis=1)
eta_2 = 2 / (p.L + p.sigma)
eta_1 = 1 / p.L
n_inner_iters = 100
n_sarah_iters = n_iters * 20
n_dgd_iters = n_iters * 20
batch_size = int(m / 100)
n_dsgd_iters = int(n_iters * m / batch_size)

centralized = [
    GD(p, n_iters=n_iters, eta=eta_2, x_0=x_0_mean),
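# Self-contained sketch (not library code) of where the two step sizes above come from: for an
# L-smooth, sigma-strongly-convex quadratic, eta = 1/L is the standard safe choice and
# eta = 2/(L + sigma) gives the fastest linear rate for plain gradient descent.
import numpy as np

L, sigma = 10.0, 1.0
A = np.diag([sigma, L])  # Hessian with extreme eigenvalues sigma and L
grad = lambda x: A.dot(x)
for eta in (1 / L, 2 / (L + sigma)):
    x = np.array([1.0, 1.0])
    for _ in range(50):
        x -= eta * grad(x)
    print(f'eta = {eta:.3f}, ||x - x*|| = {np.linalg.norm(x):.2e}')  # 2/(L+sigma) converges faster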
if __name__ == '__main__':

    import matplotlib.pyplot as plt

    n = 10
    m = 1000
    dim = 10
    noise_variance = 0.01

    p = LinearRegression(n, m, dim, noise_variance=noise_variance, n_edges=4 * n, balanced=False)
    log.info(p.m)
    p.grad_check()
    p.distributed_check()

    # p = LinearRegression(n, m, dim, noise_variance=noise_variance, n_edges=4*n)
    p.plot_graph()

    log.info('w_min = ' + str(p.w_min))
    log.info('f(w_min) = ' + str(p.f(p.w_min)))
    log.info('f_0(w_min) = ' + str(p.f(p.w_min, 0)))
    log.info('|| g(w_min) || = ' + str(np.linalg.norm(p.grad(p.w_min))))
    log.info('|| g_0(w_min) || = ' + str(np.linalg.norm(p.grad(p.w_min, 0))))

    plt.show()
def distributed_check(self):
    '''Check that the distributed function and gradient implementations are correct.'''

    def _check_1d_gradient():
        w = xp.random.randn(self.dim)
        g = self.grad(w)
        g_i = g_ij = 0
        res = True

        for i in range(self.n_agent):
            _tmp_g_i = self.grad(w, i)
            _tmp_g_ij = 0
            for j in range(self.m):
                _tmp_g_ij += self.grad(w, i, j)

            if xp.linalg.norm(_tmp_g_i - _tmp_g_ij / self.m) > 1e-5:
                log.warn(
                    'Distributed gradient check failed! Difference between local gradient at agent %d and average of all local sample gradients is %.4f'
                    % (i, xp.linalg.norm(_tmp_g_i - _tmp_g_ij / self.m)))
                res = False

            g_i += _tmp_g_i
            g_ij += _tmp_g_ij

        g_i /= self.n_agent
        g_ij /= self.m_total

        if xp.linalg.norm(g - g_i) > 1e-5:
            log.warn(
                'Distributed gradient check failed! Difference between global gradient and average of local gradients is %.4f',
                xp.linalg.norm(g - g_i))
            res = False

        if xp.linalg.norm(g - g_ij) > 1e-5:
            log.warn(
                'Distributed gradient check failed! Difference between global gradient and average of all sample gradients is %.4f'
                % xp.linalg.norm(g - g_ij))
            res = False

        return res

    def _check_2d_gradient():
        res = True
        w_2d = xp.random.randn(self.dim, self.n_agent)

        g_1d = 0
        for i in range(self.n_agent):
            g_1d += self.grad(w_2d[:, i], i=i)
        g_1d /= self.n_agent

        g_2d = self.grad(w_2d).mean(axis=1)

        if xp.linalg.norm(g_1d - g_2d) > 1e-5:
            log.warn(
                'Distributed gradient check failed! Difference between global gradient and average of distributed gradients is %.4f'
                % xp.linalg.norm(g_1d - g_2d))
            res = False

        g_2d_sample = self.grad(w_2d, j=xp.arange(self.m).reshape(-1, 1).repeat(self.n_agent, axis=1).T).mean(axis=1)

        if xp.linalg.norm(g_1d - g_2d_sample) > 1e-5:
            log.warn(
                'Distributed gradient check failed! Difference between global gradient and average of all sample gradients is %.4f'
                % xp.linalg.norm(g_1d - g_2d_sample))
            res = False

        samples = xp.random.randint(0, self.m, (self.n_agent, 10))
        g_2d_stochastic = self.grad(w_2d, j=samples)
        for i in range(self.n_agent):
            g_1d_stochastic = self.grad(w_2d[:, i], i=i, j=samples[i])
            if xp.linalg.norm(g_1d_stochastic - g_2d_stochastic[:, i]) > 1e-5:
                log.warn(
                    'Distributed gradient check failed! Difference between distributed stochastic gradient at agent %d and average of sample gradients is %.4f'
                    % (i, xp.linalg.norm(g_1d_stochastic - g_2d_stochastic[:, i])))
                res = False

        return res

    def _check_function_value():
        w = xp.random.randn(self.dim)
        f = self.f(w)
        f_i = f_ij = 0
        res = True

        for i in range(self.n_agent):
            _tmp_f_i = self.f(w, i)
            _tmp_f_ij = 0
            for j in range(self.m):
                _tmp_f_ij += self.f(w, i, j)

            if xp.abs(_tmp_f_i - _tmp_f_ij / self.m) > 1e-10:
                log.warn(
                    'Distributed function value check failed! Difference between local function value at agent %d and average of all local sample function values is %.4f'
                    % (i, xp.abs(_tmp_f_i - _tmp_f_ij / self.m)))
                res = False

            f_i += _tmp_f_i
            f_ij += _tmp_f_ij

        f_i /= self.n_agent
        f_ij /= self.m_total

        if xp.abs(f - f_i) > 1e-10:
            log.warn(
                'Distributed function value check failed! Difference between the global function value and average of local function values is %.4f'
                % xp.abs(f - f_i))
            res = False

        if xp.abs(f - f_ij) > 1e-10:
            log.warn(
                'Distributed function value check failed! Difference between the global function value and average of all sample function values is %.4f'
                % xp.abs(f - f_ij))
            res = False

        return res

    res = _check_function_value() & _check_1d_gradient() & _check_2d_gradient()

    if res:
        log.info('Distributed check succeeded!')
        return True
    else:
        return False
n_agent = 20
n_iters = 20

p = LogisticRegression(n_agent, graph_type='er', graph_params=0.3, dataset='gisette', alpha=0.001)

x_0 = np.random.rand(p.dim, n_agent)
x_0_mean = x_0.mean(axis=1)
batch_size = int(np.sqrt(p.m))
n_inner_iters = 10

W, alpha = generate_mixing_matrix(p)
log.info('alpha = %.4f', alpha)

exps = [
    GD(p, n_iters=n_iters, eta=100, x_0=x_0, W=W),
]

exps = run_exp(exps, max_iter=n_iters, name='gisette', n_process=2, plot=True, save=False)

plt.show()