def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    '''Apply the stochastic gradient descent method to minimize the mean squared
    loss function for labels y and training data tx.

    Parameters:
        y         = labels, numpy column vector
        tx        = data in matrix form (with first column = 1 for bias),
                    one data entry per row, numpy multidimensional array
        initial_w = initial values for the weights, numpy column vector
        max_iters = number of steps for the stochastic gradient descent method,
                    must be > 0 to return a meaningful loss
        gamma     = learning step

    Returns the weights corresponding to the last step.
    '''
    assert max_iters > 0
    w = initial_w
    for i in range(max_iters):
        for minibatch_y, minibatch_x in batch_iter(y, tx, batch_size=1, num_batches=1):
            # compute loss and gradient
            loss = compute_mse_loss(y, tx, w)
            gradient = compute_mean_squares_gradient(minibatch_y, minibatch_x, w)
            # update parameters
            w = w - gamma * gradient
    return (w, loss)
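# Every snippet in this section calls a batch_iter helper whose definition is not
# shown and whose exact behaviour varies between projects: some default num_batches
# to 1, others iterate over the whole data set, and the NLP prediction functions use
# a variant that batches a list of sentence pairs. The version below is only a
# minimal sketch of the commonly assumed interface, not the definitive helper used
# by any of these functions.
import numpy as np

def batch_iter(y, tx, batch_size, num_batches=1, shuffle=True):
    """Yield up to num_batches minibatches of (y, tx) with batch_size rows each."""
    data_size = len(y)
    indices = np.random.permutation(data_size) if shuffle else np.arange(data_size)
    for batch_num in range(num_batches):
        start = (batch_num * batch_size) % data_size
        end = min(start + batch_size, data_size)
        if start < end:
            yield y[indices[start:end]], tx[indices[start:end]]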
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma):
    """Stochastic gradient descent."""
    # Define parameters to store w and loss
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        for y_batch, tx_batch in batch_iter(y, tx, batch_size=batch_size, num_batches=1):
            # compute a stochastic gradient and loss
            grad, _ = compute_stoch_gradient(y_batch, tx_batch, w)
            # update w through the stochastic gradient update
            w = w - gamma * grad
            # calculate loss
            loss = compute_loss(y, tx, w)
            # store w and loss
            ws.append(w)
            losses.append(loss)
        print("SGD({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
            bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    return losses, ws
def least_squares_SGD(y, tx, initial_w, max_iters, gamma, batch_size=1, lambda_=0, min_loss_threshold=0):
    """Linear regression using stochastic gradient descent."""
    print_step = np.maximum(int(max_iters / 10), 1)  # print status whenever n_iter is a multiple of this
    w = initial_w
    loss_change = min_loss_threshold + 1
    loss = compute_loss_least_squares(y, tx, w, lambda_)
    n_iter = 0
    while (n_iter < max_iters) and (loss_change > min_loss_threshold):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
            if n_iter >= max_iters:
                break
            grad = compute_gradient_least_squares(minibatch_y, minibatch_tx, w, lambda_)
            w = w - gamma * grad
            old_loss = loss
            loss = compute_loss_least_squares(y, tx, w, lambda_)
            loss_change = np.max(np.abs(loss - old_loss))
            if loss_change <= min_loss_threshold:
                break
            if n_iter % print_step == 0:
                print("Gradient Descent({bi}/{ti}): changeInLoss={lc}, loss={l}, w0={w0}, w1={w1}".format(
                    lc=loss_change, bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
            n_iter = n_iter + 1
    return (w, loss)
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma, loss_type=Loss.MSE):
    """Stochastic gradient descent algorithm."""
    ws = [initial_w]
    losses = [compute_loss(y, tx, initial_w, loss_type)]
    w = initial_w
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
            # gradient of L_n(w) on the sampled minibatch
            gradient_n = compute_stoch_gradient(minibatch_y, minibatch_tx, w, loss_type)
            w = w - gamma * gradient_n
        loss = compute_loss(y, tx, w, loss_type)
        # store w and loss
        ws.append(w)
        losses.append(loss)
        print("Stochastic Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
            bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    return losses, ws
def least_squares_SGD(y, x, gamma, max_iters, B=1, init_guess=None):
    """
    Estimate the parameters of a linear system using stochastic least squares gradient descent.
    In:
        x (NxD): Input matrix
        y (Nx1): Output vector
        init_guess (Dx1): Initial guess
        gamma: step size
        B: batch size
        max_iters: Max number of iterations
    where N and D are respectively the number of samples and the dimension of the input vectors.
    Out:
        Estimated parameters
    """
    if init_guess is None:
        init_guess = np.zeros((x.shape[1], 1))
    N = x.shape[0]
    w = list()
    w.append(init_guess)
    for minibatch_y, minibatch_x in hp.batch_iter(y, x, B, num_batches=max_iters, shuffle=True):
        w.append(w[-1] - gamma * comp_ls_gradient(N, minibatch_x, minibatch_y - np.dot(minibatch_x, w[-1])))
    return w[-1]
def least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma):
    """Calculate the least squares solution using stochastic gradient descent."""
    w = initial_w
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in helpers.batch_iter(y, tx, batch_size, 1):
            # use the sampled minibatch (not the full data set) for the stochastic gradient
            grad = compute_gradient(minibatch_y, minibatch_tx, w)
            w = w - gamma * grad
    return (w, compute_mse(y, tx, w))
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    w = initial_w
    for n_iter in range(max_iters):
        for y_batch, tx_batch in batch_iter(y, tx, batch_size=1, num_batches=1):
            grad = compute_logistic_gradient(y_batch, tx_batch, w, lambda_)
            w = w - gamma * grad
    loss = compute_logistic_loss(y, tx, w, lambda_)
    return w, loss
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    w = initial_w
    for n_iter in range(max_iters):
        for y_batch, tx_batch in batch_iter(y, tx, batch_size=1, num_batches=1):
            grad = compute_gradient(y_batch, tx_batch, w)
            w = w - gamma * grad
    loss = compute_mse(y, tx, w)
    return w, loss
def model_predictions(model, data, vocab, DEVICE, BATCH_SIZE=16):
    """
    model: an instance of BertSCLSTM
    data: list of tuples, with each tuple consisting of correct and incorrect
          sentence string (would be split at whitespaces)
    """
    topk = 1
    # print("###############################################")
    inference_st_time = time.time()
    final_sentences = []
    VALID_BATCH_SIZE = BATCH_SIZE
    # print("data size: {}".format(len(data)))
    data_iter = batch_iter(data, batch_size=VALID_BATCH_SIZE, shuffle=False)
    model.eval()
    model.to(DEVICE)
    for batch_id, (batch_labels, batch_sentences) in enumerate(data_iter):
        # set batch data for bert
        batch_labels_, batch_sentences_, batch_bert_inp, batch_bert_splits = bert_tokenize_for_valid_examples(
            batch_labels, batch_sentences)
        if len(batch_labels_) == 0:
            print("################")
            print("Not predicting the following lines due to pre-processing mismatch: \n")
            print([(a, b) for a, b in zip(batch_labels, batch_sentences)])
            print("################")
            continue
        else:
            batch_labels, batch_sentences = batch_labels_, batch_sentences_
        batch_bert_inp = {k: v.to(DEVICE) for k, v in batch_bert_inp.items()}
        # set batch data for others
        batch_labels_ids, batch_lengths = labelize(batch_labels, vocab)
        batch_idxs, batch_lengths_ = sclstm_tokenize(batch_sentences, vocab)
        assert (batch_lengths_ == batch_lengths).all() == True
        assert len(batch_bert_splits) == len(batch_idxs)
        batch_idxs = [batch_idxs_.to(DEVICE) for batch_idxs_ in batch_idxs]
        batch_lengths = batch_lengths.to(DEVICE)
        batch_labels_ids = batch_labels_ids.to(DEVICE)
        # forward
        with torch.no_grad():
            """
            NEW: batch_predictions can now be of shape (batch_size, batch_max_seq_len, topk)
                 if topk > 1, else (batch_size, batch_max_seq_len)
            """
            _, batch_predictions = model(batch_idxs, batch_lengths, batch_bert_inp,
                                         batch_bert_splits, targets=batch_labels_ids, topk=topk)
        batch_predictions = untokenize_without_unks(batch_predictions, batch_lengths, vocab, batch_labels)
        final_sentences.extend(batch_predictions)
    # print("total inference time for this data is: {:4f} secs".format(time.time()-inference_st_time))
    return final_sentences
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_epochs, gamma, compute_stoch_gradient):
    """Stochastic gradient descent algorithm."""
    w = initial_w
    losses = np.zeros(max_epochs)
    ws = np.zeros((max_epochs, w.shape[0]))
    for i in range(max_epochs):
        generator = batch_iter(y, tx, batch_size)
        y_n, tx_n = next(generator)
        g = compute_stoch_gradient(y_n, tx_n, w)
        w = w - gamma * g
        ws[i] = w
        losses[i] = compute_cost(y, tx, w)
    return losses, ws
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    # initializing the weights, the batch size and the number of batches
    w = initial_w
    batch_size = 1
    num_batches = 1
    for i in range(max_iters):
        # iterating over each batch
        for y_batch, tx_batch in batch_iter(y, tx, batch_size, num_batches):
            # computing the gradient
            gradient = compute_gradient(y_batch, tx_batch, w)
            # updating the weights
            w = w - gamma * gradient
    # return w with the corresponding loss
    return w, compute_loss(y, tx, w)
def least_squares_SGD(y, tx, initial_w, batch_size, max_iters, gamma):
    w = initial_w
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size, num_batches=1):
            gradient = compute_gradient(minibatch_y, minibatch_tx, w)
            # update w by gradient
            w = w - gamma * gradient  # computes the new w(t+1)
    loss = compute_loss(y, tx, w)
    return w, loss
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Stochastic gradient descent algorithm."""
    w = initial_w
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, 1):
            # compute gradient
            gradient = compute_gradient(minibatch_y, minibatch_tx, w)
            # update w by gradient
            w = w - gamma * gradient
    return w, compute_loss_MSE(y, tx, w)
def test_batch_iter(self):
    """Tests the batching function."""
    from helpers import batch_iter
    import scipy.sparse as sp
    A = sp.csr_matrix(
        np.array([[1., 2., 3.],
                  [0., -1., 1.],
                  [3., 4., 5.],
                  [1., 2., 3.],
                  [0., -1., 1.],
                  [3., 4., 5.],
                  [1., 2., 3.],
                  [0., -1., 1.],
                  [3., 4., 5.]]))
    B = list(batch_iter(A, A, 2))
    self.assertEqual(len(B), 5)
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Stochastic gradient descent algorithm."""
    batch_size = 5000
    w = initial_w
    for n_iter in range(max_iters):
        y_, tx_ = next(batch_iter(y, tx, batch_size))
        gradient = compute_gradient(y_, tx_, w)
        w = w - gamma * gradient
        if n_iter % 3 == 0:
            gamma = gamma / 1.2
    loss = compute_loss(y, tx, w)
    # loss = calculate_nll(y, tx, w)
    return w, loss
def model_predictions(model, data, vocab, DEVICE, BATCH_SIZE=16):
    """
    model: an instance of ElmoSCTransformer
    data: list of tuples, with each tuple consisting of correct and incorrect
          sentence string (would be split at whitespaces)
    """
    topk = 1
    print("###############################################")
    inference_st_time = time.time()
    final_sentences = []
    VALID_BATCH_SIZE = BATCH_SIZE
    print("data size: {}".format(len(data)))
    data_iter = batch_iter(data, batch_size=VALID_BATCH_SIZE, shuffle=False)
    model.eval()
    model.to(DEVICE)
    for batch_id, (batch_clean_sentences, batch_corrupt_sentences) in enumerate(data_iter):
        # set batch data
        batch_labels, batch_lengths = labelize(batch_clean_sentences, vocab)
        batch_idxs, batch_lengths_, inverted_mask = sctrans_tokenize(batch_corrupt_sentences, vocab)
        assert (batch_lengths_ == batch_lengths).all() == True
        batch_idxs = [batch_idxs_.to(DEVICE) for batch_idxs_ in batch_idxs]
        batch_lengths = batch_lengths.to(DEVICE)
        batch_labels = batch_labels.to(DEVICE)
        inverted_mask = inverted_mask.to(DEVICE)
        batch_elmo_inp = elmo_batch_to_ids(
            [line.split() for line in batch_corrupt_sentences]).to(DEVICE)
        # forward
        with torch.no_grad():
            """
            NEW: batch_predictions can now be of shape (batch_size, batch_max_seq_len, topk)
                 if topk > 1, else (batch_size, batch_max_seq_len)
            """
            _, batch_predictions = model(batch_idxs, inverted_mask, batch_lengths,
                                         batch_elmo_inp, targets=batch_labels, topk=topk)
        batch_predictions = untokenize_without_unks(batch_predictions, batch_lengths,
                                                    vocab, batch_clean_sentences)
        final_sentences.extend(batch_predictions)
    print("total inference time for this data is: {:4f} secs".format(
        time.time() - inference_st_time))
    return final_sentences
def logistic_regression(y, tx, initial_w, max_iters, gamma):
    """
    @param gamma: step size
    @param max_iters: maximum number of iterations
    @return: optimal weights, minimum mse
    """
    batch_size = 10000
    losses = []
    w = initial_w
    y_batch = np.zeros((batch_size, 1))
    for iter in range(max_iters):
        batch = batch_iter(y, tx, batch_size, num_batches=1, shuffle=True)
        y_batch[:, 0], tx_batch = next(batch)
        loss, w = log_gradient_descent(y_batch, tx_batch, w, gamma)
        losses.append(loss)
        # print("Current iteration={i}, the loss={l}".format(i=iter, l=loss))
    return w, loss
def model_predictions(model, data, vocab, DEVICE, BATCH_SIZE=16):
    """
    model: an instance of CharLSTMWordLSTMModel
    data: list of tuples, with each tuple consisting of correct and incorrect
          sentence string (would be split at whitespaces)
    """
    topk = 1
    print("###############################################")
    inference_st_time = time.time()
    final_sentences = []
    VALID_BATCH_SIZE = BATCH_SIZE
    print("data size: {}".format(len(data)))
    data_iter = batch_iter(data, batch_size=VALID_BATCH_SIZE, shuffle=False)
    model.eval()
    model.to(DEVICE)
    for batch_id, (batch_clean_sentences, batch_corrupt_sentences) in tqdm(enumerate(data_iter)):
        # set batch data
        batch_labels, batch_lengths = labelize(batch_clean_sentences, vocab)
        batch_idxs, batch_lengths_, batch_char_lengths = char_tokenize(
            batch_corrupt_sentences, vocab, return_nchars=True)
        assert (batch_lengths_ == batch_lengths).all() == True
        batch_idxs = [batch_idxs_.to(DEVICE) for batch_idxs_ in batch_idxs]
        batch_char_lengths = [
            batch_char_lengths_.to(DEVICE) for batch_char_lengths_ in batch_char_lengths
        ]
        batch_lengths = batch_lengths.to(DEVICE)
        batch_labels = batch_labels.to(DEVICE)
        # forward
        with torch.no_grad():
            # because topk=1, batch_predictions are of shape (batch_size, batch_max_seq_len)
            _, batch_predictions = model(batch_idxs, batch_char_lengths, batch_lengths,
                                         targets=batch_labels, topk=topk)
        batch_predictions = untokenize_without_unks(batch_predictions, batch_lengths,
                                                    vocab, batch_clean_sentences)
        final_sentences.extend(batch_predictions)
    print("total inference time for this data is: {:4f} secs".format(
        time.time() - inference_st_time))
    return final_sentences
def least_squares_SGD(y, x, initial_w, max_iters, gamma, mae=False, threshold=1e-5):
    """
    Implementation of the Stochastic Gradient Descent optimization algorithm for linear regression.
    Can be run with both MSE and MAE loss.

    :param x: data matrix, numpy ndarray with shape (N, D), where N is the number of samples
              and D is the number of features
    :param y: vector of target values, numpy array with dimensions (N, 1)
    :param initial_w: vector of initial weights, numpy array with dimensions (D, 1)
    :param max_iters: how many iterations to run the algorithm, integer
    :param gamma: learning rate, positive float value
    :param mae: whether to use MAE loss, boolean, optional, the default value is False
    :param threshold: convergence threshold, positive float value
    :returns: (final weights, final loss value), tuple
    """
    # Set the initial values for the weights
    w = initial_w
    # Compute the initial loss value
    prev_loss = compute_loss(y, x, initial_w, mae)
    # Use the helper function batch_iter from Exercise 2
    # to get a random sample from the data in the form (y_n, x_n) for each iteration
    for n_iter in range(max_iters):
        for y_n, x_n in batch_iter(y, x, batch_size=1, num_batches=1):
            # Compute the gradient for only one sample (or subgradient if MAE loss is used)
            grd = compute_subgradient_mae(y_n, x_n, w) if mae else compute_gradient_mse(y_n, x_n, w)
            # Update the weights using the gradient and learning rate
            w = w - gamma * grd
        # Compute the current loss and test convergence
        loss = compute_loss(y, x, w, mae)
        if abs(loss - prev_loss) < threshold:
            print(f'converged at iter : {n_iter}')
            break
        prev_loss = loss.copy()
    # Compute the final loss value
    loss = compute_loss(y, x, w, mae)
    return w, loss
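# The snippet above relies on helpers such as compute_loss and compute_gradient_mse,
# whose definitions are not shown. The sketch below only illustrates what such
# helpers are commonly assumed to compute (the MSE/MAE loss and the MSE gradient
# -(1/N) * x^T (y - x w)); the actual helpers in the original project may differ.
import numpy as np

def compute_loss(y, x, w, mae=False):
    """MSE (or MAE) loss for targets y (N, 1), data x (N, D), weights w (D, 1)."""
    e = y - x.dot(w)              # residuals, shape (N, 1)
    if mae:
        return np.mean(np.abs(e))   # mean absolute error
    return np.mean(e ** 2) / 2      # mean squared error (with the usual 1/2 factor)

def compute_gradient_mse(y, x, w):
    """Gradient of the MSE loss with respect to w: -(1/N) * x^T (y - x w)."""
    e = y - x.dot(w)
    return -x.T.dot(e) / len(y)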
def reg_logistic_regression(y, tx, lambda_, initial_w, max_iters, gamma):
    """
    @param gamma: learning rate
    @param max_iters: maximum number of iterations
    @param batch_size: the size of the batches used for calculating the stochastic gradient
    @return: optimal weights, minimum mse
    """
    batch_size = y.shape[0] // 10  # integer batch size (a float would break np.zeros below)
    losses = []
    w = np.zeros((tx.shape[1], 1))
    y_batch = np.zeros((batch_size, 1))
    for iter in range(max_iters):
        batch = batch_iter(y, tx, batch_size, num_batches=1, shuffle=True)
        y_batch[:, 0], tx_batch = next(batch)
        loss, w = reg_log_gradient_descent(y_batch, tx_batch, w, gamma, lambda_)
        losses.append(loss)
        # print("Current iteration={i}, the loss={l}".format(i=iter, l=loss))
    return w, loss
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma):
    """Stochastic gradient descent algorithm."""
    ws = [initial_w]
    losses = [compute_loss(y, tx, initial_w)]
    w = initial_w
    for y_batch, tx_batch in batch_iter(y, tx, batch_size, max_iters):
        # compute the stochastic gradient on the sampled minibatch
        gradient = compute_gradient(y_batch, tx_batch, w)
        w = w - gamma * gradient
        loss = compute_loss(y, tx, w)
        # store w and loss
        ws.append(w)
        losses.append(loss)
        print("Stochastic Gradient Descent: loss={l}, w0={w0}, w1={w1}".format(
            l=loss, w0=w[0], w1=w[1]))
    return losses, ws
def running_gradient(y, tx, w, lambda_, method='penalized'):
    """
    Run gradient descent, using logistic regression, penalized logistic regression
    or Newton's method. Return the final weights.
    """
    max_iter = 5000
    gamma = 0.01
    threshold = 1e-8
    losses = []
    batch_size = 5000
    n_iter = 0
    # start gradient descent
    for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size, num_batches=max_iter):
        # get loss and update w
        if method == 'penalized':
            loss, w = learning_by_penalized_gradient(minibatch_y, minibatch_tx, w, gamma, lambda_)
        if method == 'newton':
            loss, w = learning_by_newton_method(minibatch_y, minibatch_tx, w, gamma)
        if method == 'gradient':
            loss, w = learning_by_gradient_descent(minibatch_y, minibatch_tx, w, gamma)
        # log info
        if n_iter % 10 == 0:
            # print(w)
            print("Current iteration={i}, loss={l}".format(i=n_iter, l=loss))
        # convergence criterion
        # if len(losses) == 1000:
        #     break
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
        n_iter += 1
    return w
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    batch_size = 1
    # Define parameters to store w and loss
    loss = 0
    w = initial_w
    for n_iter, [minib_y, minib_tx] in enumerate(batch_iter(y, tx, batch_size, max_iters)):
        grad = compute_gradient(minib_y, minib_tx, w)
        loss = compute_mse(y, tx, w)
        if n_iter % 100 == 0:
            print("Current iteration={i}, loss={l}".format(i=n_iter, l=loss))
        w = w - gamma * grad
    return w, loss
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """Stochastic gradient descent algorithm."""
    if len(initial_w.shape) == 2:
        initial_w = initial_w.reshape((max(initial_w.shape)))
    if len(y.shape) == 2:
        y = y.reshape((max(y.shape)))
    batch_size = 5000
    w = initial_w
    for n_iter in range(max_iters):
        y_, tx_ = next(batch_iter(y, tx, batch_size))
        gradient = compute_gradient(y_, tx_, w)
        w = w - gamma * gradient
        if n_iter % 3 == 0:
            gamma = gamma / 1.2
    loss = compute_loss(y, tx, w)
    return w, loss
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_epochs, gamma):
    ws = [initial_w]
    losses = []
    w = initial_w
    n_iter = 0
    for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size, max_epochs):
        grad = compute_stoch_gradient(minibatch_y, minibatch_tx, w)
        loss = co.compute_loss(y, tx, w)
        w = w - gamma * grad
        # store w and loss
        ws.append(np.copy(w))
        losses.append(loss)
        n_iter += 1
    print("Gradient Descent({bi}/{ti}): loss={l}".format(
        bi=max_epochs - 1, ti=max_epochs - 1, l=loss))
    return losses, ws
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma):
    """Stochastic gradient descent algorithm."""
    # Define parameters to store w and loss
    ws = [initial_w]
    losses = []
    w = initial_w
    for n_iter in range(max_iters):
        # get a random minibatch of data
        for minibatch_y, minibatch_x in batch_iter(y, tx, batch_size):
            grad = compute_stoch_gradient(minibatch_y, minibatch_x, w)
            loss = compute_loss(minibatch_y, minibatch_x, w)
            w = w - gamma * grad
        # store w and loss
        ws.append(w)
        losses.append(loss)
        print("Stochastic Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}"
              .format(bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    return losses, ws
def stochastic_gradient_descent(y, tx, initial_w, batch_size, max_iters, gamma, loss_type):
    """Stochastic gradient descent algorithm."""
    ws = [initial_w]
    losses = []
    w = initial_w
    batch_size = 5  # note: overrides the batch_size argument
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
            grad = compute_stoch_gradient(minibatch_y, minibatch_tx, w, loss_type)
            loss = compute_loss(y, tx, w, loss_type)
            # Update
            w = w - gamma * grad
            # Store
            ws.append(w)
            losses.append(loss)
        print("Stochastic Gradient Descent({bi}/{ti}): loss={l}, w0={w0}, w1={w1}".format(
            bi=n_iter, ti=max_iters - 1, l=loss, w0=w[0], w1=w[1]))
    return losses, ws
def stochastic_subgradient_descent(y, tx, initial_w, batch_size, max_iters, gamma, ltype="MAE"):
    """Stochastic subgradient descent algorithm."""
    w = initial_w
    g = 0
    num_batches = 1
    for n_iter in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size, num_batches):
            g = compute_subgradient(minibatch_y, minibatch_tx, w)
            # update w by subgradient
            w = w - gamma * g  # computes the new w(t+1)
    loss = compute_loss_subgradient(y, tx, w)  # compute final error
    return w, loss
def least_squares_sgd(y, tx, initial_w, max_iters, gamma):
    """Linear regression using stochastic gradient descent."""
    # if initial_w is None, we initialize it to a zeros vector
    if initial_w is None:
        initial_w = np.zeros(tx.shape[1])
    # Define parameters of the algorithm
    batch_size = 1
    # Define parameters to store w and loss
    loss = 0
    w = initial_w
    for n_iter, [mb_y, mb_tx] in enumerate(batch_iter(y, tx, batch_size, max_iters)):
        # compute gradient and loss
        gradient = compute_gradient(mb_y, mb_tx, w)
        loss = compute_loss(y, tx, w)
        # update w by gradient (not in place, so the caller's initial_w is not mutated)
        w = w - gamma * gradient
    return w, loss
def least_squares_SGD(y, tx, initial_w, max_iters, gamma):
    """
    Linear regression using stochastic gradient descent
    @param gamma: step size
    @param max_iters: maximum number of iterations
    @param batch_size: the size of the batches used for calculating the stochastic gradient
    @return: optimal weights, minimum mse
    """
    batch_size = 5000
    ws = [initial_w]
    losses = []
    w = initial_w
    for i in range(max_iters):
        for minibatch_y, minibatch_tx in batch_iter(y, tx, batch_size):
            stoch_gradient = compute_gradient(minibatch_y, minibatch_tx, w)
            loss = compute_loss(y, tx, w)
            w = w - gamma * stoch_gradient
            # store w and loss
            ws.append(np.copy(w))
            losses.append(loss)
            # print("SGD ({bi}/{ti}): loss={l}".format(bi=i, ti=max_iters - 1, l=loss))
    min_loss = min(losses)
    w = ws[losses.index(min_loss)]
    return w, min_loss