def train_gd(a: np.array, b: np.array, a_test: np.array, b_test: np.array,
             T: int, alpha: float):
    # add a column of ones to the input data, to avoid having to define an explicit bias in our weights
    a = np.concatenate([a, np.ones((len(a), 1))], axis=1)
    a_test = np.concatenate([a_test, np.ones((len(a_test), 1))], axis=1)
    n, d = a.shape

    # the weights of our SVM classifier
    x = np.zeros(d)

    logger = Logger(algo_tag=rf"GD - $\alpha={alpha}$")

    for t in tqdm(range(1, T + 1)):
        # log our results (before training, to match plots from the class)
        k = max(int(np.log10(t)), 0)
        if t % int(10**k) == 1 or t < 10:
            logger.log(
                iteration=t,
                loss=hinge_loss(a, b, x, alpha),
                train_err=error(a, b, x),
                test_err=error(a_test, b_test, x),
            )

        if alpha == 0:
            # our problem is simply convex (as the hinge loss is a convex function)
            eta_t = 1 / np.sqrt(t)
        else:
            # thanks to the regularization, our problem is alpha strongly convex
            # eta_t = 2 / (alpha * (t + 1))
            eta_t = 1 / (alpha * t)

        grad = hinge_loss_grad(a, b, x, alpha)
        x = x - eta_t * grad

    return x, logger
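# NOTE: the trainers in this file rely on hinge_loss, hinge_loss_grad and error,
# which are defined elsewhere in the repo. As a point of reference only, here is
# a minimal sketch of what they are assumed to compute for the regularized SVM
# objective f(x) = (1/n) sum_i max(0, 1 - b_i <a_i, x>) + (alpha/2) ||x||^2.
# The underscore-prefixed names below are illustrative and are not used by the
# trainers.


def _hinge_loss_sketch(a, b, x, alpha):
    # average hinge loss plus L2 regularization (assumed objective)
    margins = 1 - b * a.dot(x)
    return np.mean(np.maximum(0.0, margins)) + 0.5 * alpha * np.dot(x, x)


def _hinge_loss_grad_sketch(a, b, x, alpha):
    # subgradient of the objective above: -b_i * a_i on margin violations, 0 otherwise
    margins = 1 - b * a.dot(x)
    active = (margins > 0).astype(a.dtype)
    return -(active * b).dot(a) / len(b) + alpha * x


def _error_sketch(a, b, x):
    # fraction of misclassified samples under the sign rule
    return np.mean(np.sign(a.dot(x)) != b)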
def train_ons(
    a: np.array,
    b: np.array,
    a_test: np.array,
    b_test: np.array,
    T: int,
    gamma: float,
    alpha: float,
    radius: float,
    seed=0,
):
    np.random.seed(seed)

    # add a column of ones to the input data, to avoid having to define an explicit bias in our weights
    a = np.concatenate([a, np.ones((len(a), 1))], axis=1)
    a_test = np.concatenate([a_test, np.ones((len(a_test), 1))], axis=1)
    n, d = a.shape

    # the weights of our SVM classifier
    # x_avg is the averaged weights (online to batch conversion)
    x_avg = np.zeros(d)
    x = np.zeros(d)
    y = np.zeros(d)
    A = 1 / gamma**2 * np.eye(d)
    A_inv = gamma**2 * np.eye(d)

    logger = Logger(
        algo_tag=rf"ONS - $\alpha = {alpha} - \gamma = {gamma} - z={radius}$")

    I = np.random.randint(0, n, T)
    for t in tqdm(range(1, T + 1)):
        # pick random sample
        i = I[t - 1]
        a_, b_ = a[i][np.newaxis, :], np.array([b[i]])

        # log our results (before training, to match plots from the class)
        k = max(int(np.log10(t)), 0)
        if t % int(10**k) == 0 or t < 10:
            logger.log(
                iteration=t,
                loss=hinge_loss(a, b, x_avg, alpha),
                train_err=error(a, b, x_avg),
                test_err=error(a_test, b_test, x_avg),
            )

        grad = hinge_loss_grad(a_, b_, x, alpha)
        gg = np.outer(grad, grad)
        assert gg.shape == (d, d)

        # rank-one update of A, and of A_inv via the Sherman-Morrison formula
        A += gg
        num = A_inv.dot(gg).dot(A_inv)
        denum = 1 + grad.dot(A_inv).dot(grad)
        A_inv -= num / denum

        y = x - 1 / gamma * A_inv.dot(grad)
        x, d_0, theta = l1_ball_proj_weighted(y, radius, np.diag(A))

        # averaging
        x_avg = (x_avg * (t - 1) + x) / t

    return x, logger
def train_sreg_pm(
    a: np.array,
    b: np.array,
    a_test: np.array,
    b_test: np.array,
    T: int,
    radius: float,
    seed=0,
):
    np.random.seed(seed)

    a = np.concatenate([a, np.ones((len(a), 1))], axis=1)
    a_test = np.concatenate([a_test, np.ones((len(a_test), 1))], axis=1)
    n, d = a.shape

    x_avg = np.zeros(d)
    x = np.zeros(d)
    w = np.ones(2 * d) / (2 * d)
    directions = np.arange(d)

    I = np.random.randint(0, n, T)
    logger = Logger(algo_tag=rf"SREG $\pm$ proj - $z={radius}$")

    for t in tqdm(range(1, T + 1)):
        # sample a coordinate direction according to the current weights
        direction = random.choices(directions, weights=0.5 * (w[:d] + w[d:]))[0]

        # pick random sample
        i = I[t - 1]
        a_, b_ = a[i][np.newaxis, :], np.array([b[i]])

        # log our results (before training, to match plots from the class)
        k = max(int(np.log10(t)), 0)
        if t % int(10**k) == 1 or t < 10:
            logger.log(
                iteration=t,
                loss=hinge_loss(a, b, x_avg, 0),
                train_err=error(a, b, x_avg),
                test_err=error(a_test, b_test, x_avg),
            )

        eta_t = 1 / np.sqrt(t)
        grad = hinge_loss_grad_partial(a_, b_, x, 0, direction)

        # multiplicative update on the +/- copy of the sampled coordinate
        if grad > 0:
            if abs(w[direction]) > 1e-9:
                w[direction] = np.exp(
                    -eta_t * grad / w[direction]) * w[direction]
        else:
            if abs(w[direction + d]) > 1e-9:
                w[direction + d] = (np.exp(eta_t * grad / w[direction + d]) *
                                    w[direction + d])
        w = w / np.sum(w)
        x = radius * (w[:d] - w[d:])

        # averaging
        x_avg = (x_avg * (t - 1) + x) / t

    return x, logger
def train_sgd_proj(
    a: np.array,
    b: np.array,
    a_test: np.array,
    b_test: np.array,
    T: int,
    alpha: float,
    radius: float,
    seed=0,
):
    np.random.seed(seed)

    # add a column of ones to the input data, to avoid having to define an explicit bias in our weights
    a = np.concatenate([a, np.ones((len(a), 1))], axis=1)
    a_test = np.concatenate([a_test, np.ones((len(a_test), 1))], axis=1)
    n, d = a.shape

    # x_avg is the averaged weights (online to batch conversion)
    # x is weight (online version)
    x_avg = np.zeros(d)
    x = np.zeros(d)

    logger = Logger(algo_tag=rf"SGDproj - $\alpha={alpha} - z={radius}$")

    I = np.random.randint(0, n, T)
    for t in tqdm(range(1, T + 1)):
        # pick random sample
        i = I[t - 1]
        a_, b_ = a[i][np.newaxis, :], np.array([b[i]])

        # log our results (before training, to match plots from the class)
        k = max(int(np.log10(t)), 0)
        if t % int(10**k) == 1 or t < 10:
            logger.log(
                iteration=t,
                loss=hinge_loss(a, b, x_avg, alpha),
                train_err=error(a, b, x_avg),
                test_err=error(a_test, b_test, x_avg),
            )

        if alpha == 0:
            # our problem is convex (as the hinge loss is a convex function)
            eta_t = 1 / np.sqrt(t)
        else:
            # eta_t = 2 / (alpha * t)
            eta_t = 1 / (alpha * t)

        grad = hinge_loss_grad(a_, b_, x, alpha)
        x = x - eta_t * grad
        x, d_0, theta = l1_ball_proj(x, radius)

        # averaging
        x_avg = (x_avg * (t - 1) + x) / t

    return x_avg, logger
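# NOTE: l1_ball_proj (and its weighted variant) are implemented elsewhere in the
# repo. As an illustrative reference only, below is a standard sketch of the
# unweighted Euclidean projection onto the l1 ball (Duchi et al., 2008); the
# (projection, active-coordinate count, threshold) return triple mirrors the
# (x, d_0, theta) unpacking above and is an assumption about the real helper.


def _l1_ball_proj_sketch(y, z):
    # y already inside the ball: nothing to do
    if np.sum(np.abs(y)) <= z:
        return y, len(y), 0.0
    # sort magnitudes in decreasing order and find the soft-threshold theta
    u = np.sort(np.abs(y))[::-1]
    css = np.cumsum(u)
    rho = np.nonzero(u * np.arange(1, len(y) + 1) > (css - z))[0][-1]
    theta = (css[rho] - z) / (rho + 1.0)
    # shrink every coordinate towards zero by theta
    x = np.sign(y) * np.maximum(np.abs(y) - theta, 0.0)
    return x, int(rho) + 1, theta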
def train_seg_pm(
    a: np.array,
    b: np.array,
    a_test: np.array,
    b_test: np.array,
    T: int,
    radius: float,
    seed=0,
):
    np.random.seed(seed)

    # add a column of ones to the input data, to avoid having to define an explicit bias in our weights
    a = np.concatenate([a, np.ones((len(a), 1))], axis=1)
    a_test = np.concatenate([a_test, np.ones((len(a_test), 1))], axis=1)
    n, d = a.shape

    # the weights of our SVM classifier
    # x_avg is the averaged weights (online to batch conversion)
    x_avg = np.zeros(d)
    x = np.zeros(d)
    theta = np.zeros(2 * d)
    w = np.zeros(2 * d)

    logger = Logger(algo_tag=rf"SEG $\pm$ proj - $z={radius}$")

    I = np.random.randint(0, n, T)
    for t in tqdm(range(1, T + 1)):
        # pick random sample
        i = I[t - 1]
        a_, b_ = a[i][np.newaxis, :], np.array([b[i]])

        # log our results (before training, to match plots from the class)
        k = max(int(np.log10(t)), 0)
        if t % int(10**k) == 1 or t < 10:
            logger.log(
                iteration=t,
                loss=hinge_loss(a, b, x_avg, 0),
                train_err=error(a, b, x_avg),
                test_err=error(a_test, b_test, x_avg),
            )

        eta_t = 1 / np.sqrt(t)
        grad = hinge_loss_grad(a_, b_, x, 0)

        # exponentiated gradient on the +/- copies of the coordinates
        theta[:d] = theta[:d] - eta_t * grad
        theta[d:] = theta[d:] + eta_t * grad
        w = softmax(theta)
        # x is the scaled difference of two probability vectors, so ||x||_1 <= radius
        x = radius * (w[:d] - w[d:])

        # averaging
        x_avg = (x_avg * (t - 1) + x) / t

    return x, logger
def train_adagrad(
    a: np.array,
    b: np.array,
    a_test: np.array,
    b_test: np.array,
    T: int,
    radius: float,
    seed=0,
):
    np.random.seed(seed)

    # add a column of ones to the input data, to avoid having to define an explicit bias in our weights
    a = np.concatenate([a, np.ones((len(a), 1))], axis=1)
    a_test = np.concatenate([a_test, np.ones((len(a_test), 1))], axis=1)
    n, d = a.shape

    # the weights of our SVM classifier
    # x_avg is the averaged weights (online to batch conversion)
    x_avg = np.zeros(d)
    x = np.zeros(d)
    y = np.zeros(d)

    # running sum of squared gradients (diagonal AdaGrad preconditioner)
    DELTA = 1e-5
    S = np.ones(d) * DELTA

    logger = Logger(algo_tag=rf"Adagrad - $z={radius}$")

    I = np.random.randint(0, n, T)
    for t in tqdm(range(1, T + 1)):
        # pick random sample
        i = I[t - 1]
        a_, b_ = a[i][np.newaxis, :], np.array([b[i]])

        # log our results (before training, to match plots from the class)
        k = max(int(np.log10(t)), 0)
        if t % int(10**k) == 1 or t < 10:
            logger.log(
                iteration=t,
                loss=hinge_loss(a, b, x_avg, 0),
                train_err=error(a, b, x_avg),
                test_err=error(a_test, b_test, x_avg),
            )

        grad = hinge_loss_grad(a_, b_, x, 0)
        S += grad**2
        D = np.diag(np.sqrt(S))
        D_inv = np.diag(1 / np.sqrt(S))

        # preconditioned step followed by a weighted l1-ball projection
        y = x - D_inv.dot(grad)
        x, d_0, theta = l1_ball_proj_weighted(y, radius, np.diag(D))

        # averaging
        x_avg = (x_avg * (t - 1) + x) / t

    return x, logger
def train_hogwild(
    a: np.array,
    b: np.array,
    a_test: np.array,
    b_test: np.array,
    T: int,
    alpha: float,
    K: int,
    beta: float,
    theta: float,
    n_processes: int,
    sequential: bool,
    seed: int,
    use_logger=True,
):
    np.random.seed(seed)

    a = np.concatenate([a, np.ones((len(a), 1))], axis=1)
    a_test = np.concatenate([a_test, np.ones((len(a_test), 1))], axis=1)
    n, d = a.shape

    # create x in shared memory, so that all processes can write to it
    x_memmap = os.path.join(folder, f'x_{datetime.now().strftime("%H%M%S")}')
    x = np.memmap(x_memmap, dtype=a.dtype, shape=d, mode="w+")

    print(f"Training hogwild with seed {seed}")

    if use_logger:
        logger = Logger(
            algo_tag=
            rf"Hogwild {'seq' if sequential else f'n_jobs={n_processes}'} - $K={K}$"
        )

    t0 = perf_counter_ns()
    t = 1
    eta_t = theta / alpha
    while t <= T:
        if use_logger:
            logger.log(
                iteration=t,
                loss=hinge_loss(a, b, x, alpha),
                train_err=error(a, b, x),
                test_err=error(a_test, b_test, x),
                eta_t=eta_t,
                time_elapsed=(perf_counter_ns() - t0) / 1e9,
            )

        # don't do more steps than necessary
        if t + K * n_processes < T:
            steps_per_processor = K
        else:
            # at least one step per worker, so that the loop always advances
            steps_per_processor = max(1, (T - t) // n_processes)

        indices = [
            np.random.randint(0, n, steps_per_processor)
            for p in range(n_processes)
        ]
        if sequential:
            # mimic Hogwild without multiprocessing (similar to SGD)
            for I_p in indices:
                train_epoch_hogwild(x, a, b, I_p, eta_t, alpha)
        else:
            Parallel(n_jobs=n_processes, verbose=0)(
                delayed(train_epoch_hogwild)(x, a, b, I_p, eta_t, alpha)
                for I_p in indices)

        # increase the number of steps and decrease the learning rate
        K = int(K / beta)
        eta_t = beta * eta_t  # learning rate decay from the original paper
        t += steps_per_processor

    dt = (perf_counter_ns() - t0) / 1e9  # execution time in sec
    if use_logger:
        logger.log(
            iteration=t,
            loss=hinge_loss(a, b, x, alpha),
            train_err=error(a, b, x),
            test_err=error(a_test, b_test, x),
            eta_t=eta_t,
            time_elapsed=dt,
        )
        return x, logger
    else:
        return dt, t, error(a_test, b_test, x)
def train_sgd(
    a: np.array,
    b: np.array,
    a_test: np.array,
    b_test: np.array,
    T: int,
    alpha: float,
    return_avg=True,
    seed=0,
    use_logger=True,
):
    np.random.seed(seed)

    # add a column of ones to the input data, to avoid having to define an explicit bias in our weights
    a = np.concatenate([a, np.ones((len(a), 1))], axis=1)
    a_test = np.concatenate([a_test, np.ones((len(a_test), 1))], axis=1)
    n, d = a.shape

    # x_avg is the averaged weights (online to batch conversion)
    # x is weight (online version)
    x_avg = np.zeros(d)
    x = np.zeros(d)

    if use_logger:
        logger = Logger(algo_tag=rf"SGD - {'x_avg' if return_avg else 'x_T'}")

    t0 = perf_counter_ns()
    I = np.random.randint(0, n, T)
    for t in tqdm(range(1, T + 1)):
        # pick random sample
        i = I[t - 1]
        a_, b_ = a[i][np.newaxis, :], np.array([b[i]])

        if alpha == 0:
            # our problem is convex (as the hinge loss is a convex function)
            eta_t = 1 / np.sqrt(t)
        else:
            eta_t = 1 / (alpha * t)

        # log our results (before training, to match plots from the class)
        k = max(int(np.log10(t)), 0)
        if (t % int(10**k) == 1 or t < 10) and use_logger:
            logger.log(
                iteration=t,
                loss=hinge_loss(a, b, x_avg, alpha),
                train_err=error(a, b, x_avg),
                test_err=error(a_test, b_test, x_avg),
                eta_t=eta_t,
                time_elapsed=(perf_counter_ns() - t0) / 1e9,
            )

        grad = hinge_loss_grad(a_, b_, x, alpha)
        x = x - eta_t * grad

        # averaging
        if return_avg:
            x_avg = (x_avg * (t - 1) + x) / t
        else:
            x_avg = x

    dt = (perf_counter_ns() - t0) / 1e9  # execution time in sec
    if use_logger:
        return x_avg, logger
    else:
        return dt, T, error(a_test, b_test, x)
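# Usage sketch (not one of the repo's entry points): how one of the trainers
# above might be driven on synthetic data. The (n, d) feature matrix and
# {-1, +1} label conventions are assumptions read off the call sites above.
if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n, d = 1000, 20
    x_true = rng.normal(size=d)
    a_all = rng.normal(size=(n, d))
    b_all = np.sign(a_all.dot(x_true) + 0.1 * rng.normal(size=n))
    a_train, a_test = a_all[:800], a_all[800:]
    b_train, b_test = b_all[:800], b_all[800:]

    # e.g. averaged SGD on the alpha-strongly-convex objective for 10^4 steps
    x_hat, sgd_logger = train_sgd(a_train, b_train, a_test, b_test,
                                  T=10_000, alpha=0.01)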