def sjlt_error_vs_iterations():
    """Record per-iteration IHS error for several SJLT column sparsities.

    One Gaussian design is drawn and solved exactly with least squares.
    For every column sparsity in ``col_sparsities`` and every method in
    the module-level ``sketches``, ``NTRIALS`` runs of
    ``ols_fit_new_sketch_track_errors`` are executed and
    ``prediction_error`` of each iterate is accumulated against both the
    least-squares solution and the generating vector.

    The trial-averaged error vectors are pretty-printed and saved with
    ``np.save`` under ``../../output/ihs_baselines/``.

    NOTE: results are keyed by column sparsity only, so if ``sketches``
    holds more than one method the last one overwrites the others.
    """
    n = 6_000
    d = 200
    gamma_vals = [5]
    sketch_size = int(gamma_vals[0] * d)
    col_sparsities = [1, 4, 16]
    number_iterations = 20

    # Output dictionaries, keyed by column sparsity.
    error_to_lsq = {s: [] for s in col_sparsities}
    error_to_truth = {s: [] for s in col_sparsities}
    print(error_to_lsq)
    print(error_to_truth)

    X, y, x_star = gaussian_design_unconstrained(n, d, variance=1.0)

    # Least squares estimator.  rcond=None matches the tests in this file
    # and avoids the FutureWarning raised by the legacy default.
    x_opt = np.linalg.lstsq(X, y, rcond=None)[0]

    for col_sparsity in col_sparsities:
        print("Testing col sparsity: {}, num_iterations: {}".format(
            col_sparsity, number_iterations))
        for sketch_method in sketches:
            lsq_error = np.zeros((number_iterations, ))
            truth_error = np.zeros_like(lsq_error)
            my_ihs = ihs(X, y, sketch_method, sketch_size, col_sparsity)
            for trial in range(NTRIALS):
                print('*' * 80)
                print("{}, trial: {}".format(sketch_method, trial))
                _, x_iters = my_ihs.ols_fit_new_sketch_track_errors(
                    number_iterations)
                # Column t of x_iters is the iterate after step t.
                for t in range(x_iters.shape[1]):
                    iterate = x_iters[:, t]
                    lsq_error[t] += prediction_error(X, iterate, x_opt)
                    truth_error[t] += prediction_error(X, iterate, x_star)
                print(lsq_error)
            mean_lsq_error = lsq_error / NTRIALS
            mean_truth_error = truth_error / NTRIALS
            print(mean_lsq_error)
            error_to_lsq[col_sparsity] = mean_lsq_error
            error_to_truth[col_sparsity] = mean_truth_error

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(error_to_lsq)
    pretty.pprint(error_to_truth)

    # Save the dictionaries
    save_dir = '../../output/ihs_baselines//'
    np.save(save_dir + 'sjlt_error_sparsity_opt', error_to_lsq)
    np.save(save_dir + 'sjlt_error_sparsity_truth', error_to_truth)
def test_ihs_initialises(all_sketch_methods):
    """Check that ``ihs.__init__`` stores its arguments unchanged.

    For each sketch method an ``ihs`` object is built and its ``A``,
    ``b``, ``sketch_method`` and ``sketch_dimension`` attributes are
    compared with the values passed to the constructor.
    """
    features, targets = make_regression(1000, 2)
    sketch_rows = 100
    for method_name in all_sketch_methods:
        solver = ihs(features, targets, method_name, sketch_rows)
        assert np.array_equal(solver.A, features)
        assert np.array_equal(solver.b, targets)
        assert solver.sketch_method == method_name
        assert solver.sketch_dimension == sketch_rows
def single_exp(_trial, n, d, X, y, sketch_size, sketch_method, run_time,
               sklearn_lasso_bound):
    """Run one timed IHS lasso trial on a row-shuffled copy of the data.

    The rows of ``X`` and ``y`` are permuted with a fresh random
    permutation of ``range(n)``, an ``ihs`` solver is built on the
    shuffled data, and ``lasso_fit_new_sketch_timing`` is called with
    ``sklearn_lasso_bound`` and ``run_time``.

    Returns the fitted vector and the number of columns of the returned
    error track, which is used as the iteration count.  ``d`` is accepted
    but not used.
    """
    print("Trial {}".format(_trial))
    row_order = np.random.permutation(n)
    solver = ihs(X[row_order, :], y[row_order], sketch_method, sketch_size)
    x_ihs, error_track = solver.lasso_fit_new_sketch_timing(
        sklearn_lasso_bound, run_time)
    return x_ihs, error_track.shape[1]
def error_vs_dimensionality():
    """Compare estimation error against problem dimension ``d``.

    For ``d`` in ``2**4 .. 2**8`` with ``n = 100 * d`` rows, every method
    in ``sketches`` plus ``'Exact'`` and ``'Sketch & Solve'`` is run
    ``NTRIALS`` times on freshly generated Gaussian designs.  The square
    root of ``prediction_error(X, x_star, x_hat)`` is averaged over the
    trials.

    The resulting ``{method: {d: error}}`` dictionary, with an extra
    ``'Dimensions'`` entry listing the dimensions tested, is
    pretty-printed and saved to
    ``../../output/ihs_baselines/error_vs_dims``.
    """
    dimension = [2**i for i in range(4, 9)]
    METHODS = sketches + ['Exact', 'Sketch & Solve']
    sampling_rate = 10
    num_iterations = 5

    # Output dictionary: method --> dimension --> accumulated error.
    error_to_truth = {
        method: {d: 0 for d in dimension} for method in METHODS
    }
    print(error_to_truth)

    for d in dimension:
        n = 100 * d
        print(f'TESTING {n},{d}')
        for method in METHODS:
            col_sparsity = 4 if method == 'sjlt' else 1
            for trial in range(NTRIALS):
                # Generate the data
                X, y, x_star = gaussian_design_unconstrained(n, d, 1.0)
                # Strings must be compared with ==; `is` tests object
                # identity and only worked through interning.
                if method == "Exact":
                    print('Exact method.')
                    x_hat = np.linalg.lstsq(X, y, rcond=None)[0]
                elif method == "Sketch & Solve":
                    # One sketch whose size is the total rows that IHS
                    # uses across all of its iterations.
                    sketch_size = sampling_rate * num_iterations * d
                    print(f"S&S with {sketch_size} sketch size")
                    _sketch = rp(X, sketch_size, 'countSketch', col_sparsity)
                    SA, Sb = _sketch.sketch_data_targets(y)
                    x_hat = np.linalg.lstsq(SA, Sb, rcond=None)[0]
                else:
                    sketch_size = sampling_rate * d
                    print(
                        f"Using {num_iterations} iterations, sketch_size {sketch_size} and {method}"
                    )
                    my_ihs = ihs(X, y, method, sketch_size, col_sparsity)
                    x_hat = my_ihs.ols_fit_new_sketch(num_iterations)
                error = (prediction_error(X, x_star, x_hat))**(0.5)
                error_to_truth[method][d] += error

    # Convert accumulated sums into means over the trials.
    for method in METHODS:
        for d in dimension:
            error_to_truth[method][d] /= NTRIALS
    error_to_truth['Dimensions'] = dimension

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(error_to_truth)
    save_dir = '../../output/ihs_baselines/'
    np.save(save_dir + 'error_vs_dims', error_to_truth)
def test_ols_new_sketch_per_iteration(all_sketch_methods):
    """IHS with a fresh sketch each iteration should recover the OLS fit.

    Both ``ols_fit_new_sketch`` and its error-tracking variant are run for
    20 iterations with sketch size 500, and each result must be
    ``np.allclose`` to the ``np.linalg.lstsq`` solution.
    """
    X, y, _ = gaussian_design_unconstrained(2**13, 50, variance=2.5)
    # rcond=None only silences the lstsq warning, as per the NumPy docs.
    x_opt, *_ = np.linalg.lstsq(X, y, rcond=None)
    for sketch_method in all_sketch_methods:
        solver = ihs(X, y, sketch_method, 500)
        x_ihs = solver.ols_fit_new_sketch(iterations=20)
        tracked = solver.ols_fit_new_sketch_track_errors(iterations=20)
        x_ihs_track, error_track = tracked
        print(sketch_method, np.linalg.norm(x_ihs - x_opt))
        print(f'Tracking {sketch_method}, error {np.linalg.norm(x_ihs_track - x_opt)}')
        assert np.allclose(x_opt, x_ihs)
        assert np.allclose(x_opt, x_ihs_track)
def test_ols_one_sketch_per_iteration(all_sketch_methods):
    """IHS reusing *a single* sketch should still recover the OLS fit.

    A larger sketch (1000 rows) is used than in the new-sketch-per-
    iteration test.  The plain solver runs for 50 iterations and the
    error-tracking variant for 20; both results are compared with the
    ``np.linalg.lstsq`` solution.
    """
    X, y, _ = gaussian_design_unconstrained(2**13, 50, variance=2.5)
    # rcond=None only silences the lstsq warning, as per the NumPy docs.
    x_opt, *_ = np.linalg.lstsq(X, y, rcond=None)
    for sketch_method in all_sketch_methods:
        solver = ihs(X, y, sketch_method, 1000)
        x_ihs = solver.ols_fit_one_sketch(iterations=50)
        tracked = solver.ols_fit_one_sketch_track_errors(iterations=20)
        x_ihs_track, error_track = tracked
        print(sketch_method, np.linalg.norm(x_ihs - x_opt))
        print(f'Tracking {sketch_method}, error {np.linalg.norm(x_ihs_track - x_opt)}')
        np.testing.assert_array_almost_equal(x_ihs, x_opt)
        assert np.allclose(x_opt, x_ihs_track)
def test_lasso_solver_time(all_sketch_methods):
    """Check the time-limited IHS lasso against the global lasso QP.

    The reference solution comes from ``lasso_solver`` on the full data.
    It is not compared with sklearn's ``Lasso`` because there is no clean
    matching between the regularising parameters, so only the global and
    the iterative QPs are checked for agreement.

    Each sketch method gets a 1.5 second budget and a sketch size of 500;
    the result must match the reference with relative tolerance 1e-1.
    """
    X, y, _ = gaussian_design_unconstrained(2000, 10, 1.0)
    n, d = X.shape
    ell_1_bound = 100.0
    x_opt = lasso_solver(X, y, ell_1_bound)
    for sketch_method in all_sketch_methods:
        my_ihs = ihs(X, y, sketch_method, 500)
        x_ihs_track, error_track = my_ihs.lasso_fit_new_sketch_timing(
            ell_1_bound, 1.5)
        # Squared norm of X @ (difference), scaled by 1/n.
        final_sol_error = (1 / n) * np.linalg.norm(
            X @ (x_ihs_track - x_opt))**2
        print(
            f'Tracking {sketch_method}, error {np.linalg.norm(x_ihs_track - x_opt)}'
        )
        print("log Error to opt: {}".format(np.log(final_sol_error)))
        print(f"{error_track.shape[1]} iterations completed")
        print(np.c_[x_opt, x_ihs_track])
        # The third positional argument of np.allclose is rtol; name it.
        assert np.allclose(x_opt, x_ihs_track, rtol=1E-1)
def error_vs_iterations():
    """Plot single-sketch IHS error per iteration for each step size.

    One Gaussian design is drawn and solved exactly.  For every sketch
    size factor in ``gamma_vals``, every method in ``sketches`` and every
    step size in the module-level ``STEPSIZE``, ``NTRIALS`` runs of
    ``ols_fit_one_sketch_track_errors`` are executed and
    ``prediction_error`` of each iterate against the least-squares
    solution is averaged over the trials.

    The averaged curves are pretty-printed and shown on a log-scale
    matplotlib plot, one curve per step size.

    NOTE: markers and colours are indexed by step-size position, so at
    most six step sizes are supported.  The title names ``sketches[0]``
    only, so the plot is meaningful for one method and one sketch size.
    """
    n = 6_000
    d = 200
    gamma_vals = [5]
    number_iterations = 30

    # Output dictionary indexed by:
    # sketch method (sketches) --> sketch size (gamma_vals) --> STEPSIZE
    error_to_lsq = {
        sketch_name: {
            gamma: {step: [] for step in STEPSIZE} for gamma in gamma_vals
        }
        for sketch_name in sketches
    }

    X, y, x_star = gaussian_design_unconstrained(n, d, variance=1.0)

    # Least squares estimator.  rcond=None matches the tests in this file
    # and avoids the FutureWarning raised by the legacy default.
    x_opt = np.linalg.lstsq(X, y, rcond=None)[0]
    print('-' * 80)
    print("Beginning test")
    lsq_vs_truth_errors = np.log(np.sqrt(prediction_error(X, x_opt, x_star)))
    print(lsq_vs_truth_errors)

    for gamma in gamma_vals:
        sketch_size = int(gamma * d)
        print("Testing gamma: {}, num_iterations: {}".format(
            gamma, number_iterations))
        for sketch_method in sketches:
            col_sparsity = 4 if sketch_method == 'sjlt' else 1
            my_ihs = ihs(X, y, sketch_method, sketch_size, col_sparsity)
            for step in STEPSIZE:
                # Fresh accumulator for every step size.
                lsq_error = np.zeros((number_iterations, ))
                for trial in range(NTRIALS):
                    print('*' * 80)
                    print("{}, trial: {}".format(sketch_method, trial))
                    print('Step size: ', step)
                    _, x_iters = my_ihs.ols_fit_one_sketch_track_errors(
                        number_iterations, step)
                    # Column t of x_iters is the iterate after step t.
                    for t in range(x_iters.shape[1]):
                        residual = prediction_error(X, x_iters[:, t], x_opt)
                        print('Trial {}, residual {}'.format(t, residual))
                        lsq_error[t] += residual
                # Sketching Error for this step size.
                frob_error = my_ihs.frob_error
                spec_error = my_ihs.spectral_error
                print('Frobenius error: ', frob_error)
                print('Spectral error: ', spec_error)
                mean_lsq_error = lsq_error / NTRIALS
                error_to_lsq[sketch_method][gamma][step] = mean_lsq_error

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(error_to_lsq)

    ### PLOTTING ###
    my_markers = ['.', 's', '^', 'D', '*', 'h']
    my_colours = ['C0', 'C1', 'C2', 'C3', 'C4', 'C5']
    fig, ax = plt.subplots()
    x_vals = range(1, number_iterations + 1)
    for gamma in gamma_vals:
        for sketch_method in sketches:
            for i, step in enumerate(STEPSIZE):
                residual = error_to_lsq[sketch_method][gamma][step]
                ax.plot(x_vals,
                        residual,
                        label=step,
                        marker=my_markers[i],
                        color=my_colours[i])
    ax.set_yscale('log')
    ax.set_xticks(x_vals[1::2])
    ax.set_xlabel("Iterations")
    # Raw string: in a normal literal "\t" is a TAB and "\|" is an invalid
    # escape, so the label never reached mathtext as written.
    ax.set_ylabel(r'$\| x^t - x_{\mathrm{opt}}\|_A^2$')
    # nb this only makes sense for one sketch dimension
    ax.legend(title='Step sizes')
    ax.set_title('{}, m={}d, step size varied'.format(sketches[0], gamma))
    plt.show()
def error_vs_time_real_data(data_name, X, y, penalty, sampling_factors,
                            trials, times, x_opt):
    """Measure IHS lasso error on a dataset as the time budget grows.

    For every method in the module-level ``sketches``, every factor
    ``gamma`` in ``sampling_factors`` (sketch size ``int(gamma * d)``)
    and every budget in ``times``, ``trials`` runs of
    ``lasso_fit_new_sketch_timing(penalty, time_)`` are performed and the
    median over the trials is stored.

    Once the log10 of the median error to ``x_opt`` reaches -16 or
    below, later time budgets for that (method, gamma) pair reuse the
    previous result instead of re-running the solver.

    Args:
        data_name: Label used only in log output.
        X: Data matrix; only ``X.shape`` is read here before it is passed
            on to ``ihs`` and ``prediction_error``.
        y: Targets passed to ``ihs``.
        penalty: First argument of ``lasso_fit_new_sketch_timing``.
        sampling_factors: Iterable of sketch size multipliers.
        trials: Number of repetitions per configuration.
        times: Iterable of time budgets, tested in the order given.
        x_opt: Reference solution that the errors are measured against.

    Returns:
        Nested dict ``results[sketch][gamma][time]`` with keys
        ``"error to opt"``, ``"solution error"`` and
        ``"num iterations"``.
    """
    # Experimental setup
    print(80 * "-")
    print("Testing dataset: {}".format(data_name))
    print("TESTING LASSO ITERATIVE HESSIAN SKETCH ALGORITHM")
    times2test = times
    n, d = X.shape
    # Report a single yes/no answer rather than an elementwise array.
    print("Is x_OPT all zeros? {}".format(not np.any(x_opt)))

    time_results = {
        sketch: {gamma: {} for gamma in sampling_factors}
        for sketch in sketches
    }

    for sketch_method in sketches:
        for gamma in sampling_factors:
            # log10 of the last median error; used to check whether the
            # error is small enough to skip the remaining time budgets.
            solution_error_for_iter_check = 1.0
            for time_ in times2test:
                print("-" * 80)
                print("Testing time: {}".format(time_))
                # np.int was removed in NumPy 1.24; the builtin int is
                # the documented replacement.
                print("int-log-error: {}".format(
                    int(solution_error_for_iter_check)))
                if int(solution_error_for_iter_check) <= -16:
                    # Continuing for longer doesn't gain anything, so
                    # just reuse the previous results.
                    time_results[sketch_method][gamma][time_] = {
                        "error to opt": total_error2opt,
                        "solution error": total_sol_error,
                        "num iterations": total_iters_used
                    }
                    print("Already converged before time {} seconds so continuing.".format(time_))
                    continue

                trial_error2opt = []
                trial_sol_error = []
                trial_iters_used = []
                print("IHS-LASSO ALGORITHM on ({},{}) WITH {}, gamma {}".format(n,d,sketch_method, gamma))
                for _trial in range(trials):
                    print("Trial {}".format(_trial))
                    # The permutation is still drawn so the global NumPy
                    # RNG stream of seeded runs is unchanged.
                    # NOTE(review): the solver is fitted on the
                    # unshuffled X, y; confirm whether shuffled rows
                    # were intended, as in single_exp.
                    np.random.permutation(n)
                    my_ihs = ihs(X, y, sketch_method, int(gamma * d))
                    # The second return value is the error track; its
                    # column count is the number of iterations, as in
                    # single_exp and test_lasso_solver_time.
                    x_ihs, error_track = my_ihs.lasso_fit_new_sketch_timing(
                        penalty, time_)
                    iters_used = error_track.shape[1]
                    my_prediction_error = prediction_error(X, x_opt, x_ihs)
                    print("Iterations completed: ", iters_used)
                    print("Prediction error: ", my_prediction_error)

                    error2opt = my_prediction_error
                    solution_error = (1 / n) * np.linalg.norm(x_ihs - x_opt)**2
                    print("Trial: {}, Error: {}".format(_trial, error2opt))
                    print("-" * 80)

                    trial_error2opt.append(error2opt)
                    trial_sol_error.append(solution_error)
                    trial_iters_used.append(iters_used)

                # Summaries are medians over the trials; the "Mean"
                # wording in the log messages below is historical.
                total_error2opt = np.median(trial_error2opt)
                total_sol_error = np.median(trial_sol_error)
                total_iters_used = np.median(trial_iters_used)
                print("Mean log||x^* - x'||_A^2: {}".format(np.log10(total_error2opt)))
                print("Mean log||x^* - x'||^2: {}".format(total_sol_error))
                print("Mean number of {} iterations used".format(total_iters_used))
                time_results[sketch_method][gamma][time_] = {
                    "error to opt": total_error2opt,
                    "solution error": total_sol_error,
                    "num iterations": total_iters_used
                }
                # Bookkeeping - if the error is at 10E-16 don't do
                # another iteration.
                solution_error_for_iter_check = np.log10(total_error2opt)
                print("New sol_error_iters: {}".format(solution_error_for_iter_check))

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(time_results)
    return time_results
def solution_error_vs_row_dim():
    """Measure solution error as the number of rows ``n`` grows.

    Errors are recorded in both:
        (i) Euclidean norm (``mean_square_error``)
        (ii) Prediction norm (``prediction_error``)
    and with respect to:
        (i) the optimal least-squares solution ``x_opt``
        (ii) the ground truth returned by the data generator.

    For each entry of the module-level ``ROWDIMS``, ``NTRIALS`` Gaussian
    designs with ``D`` columns are generated and every entry of
    ``METHODS`` is run: IHS for the names in ``sketches``, a single
    countSketch "Sketch & Solve", and otherwise the exact solution.
    Trial-averaged results are pretty-printed and saved with ``np.save``
    under ``../../output/baselines/``.
    """
    print('Experimental setup:')
    print(f'IHS sketch size {SKETCH_SIZE}')
    print(f'Sketch and solve sketch size {CLASSICAL_SKETCH_SIZE}')
    print(f'Number of rounds {ROUNDS}')

    num_experiments = len(ROWDIMS)

    def _empty_results(extra_keys):
        # One zero accumulator per sketch method plus the extra methods.
        results = {name: np.zeros(num_experiments, ) for name in sketches}
        for key in extra_keys:
            results[key] = np.zeros(num_experiments, )
        return results

    # Output dictionaries.  'Exact' is only compared with the truth,
    # because the exact solution is itself x_opt.
    MSE_OPT = _empty_results(['Sketch & Solve'])
    PRED_ERROR_OPT = _empty_results(['Sketch & Solve'])
    MSE_TRUTH = _empty_results(['Sketch & Solve', 'Exact'])
    PRED_ERROR_TRUTH = _empty_results(['Sketch & Solve', 'Exact'])

    def _record(method, index, X, x_opt, x_true, x_hat):
        # Accumulate all four error measures for one estimate.
        MSE_OPT[method][index] += mean_square_error(x_opt, x_hat)
        PRED_ERROR_OPT[method][index] += prediction_error(X, x_opt, x_hat)
        MSE_TRUTH[method][index] += mean_square_error(x_true, x_hat)
        PRED_ERROR_TRUTH[method][index] += prediction_error(X, x_true, x_hat)

    ## Experiment
    # enumerate gives the position directly; ROWDIMS.index(n) would map
    # repeated row sizes onto the first occurrence.
    for experiment_index, n in enumerate(ROWDIMS):
        print(f'Testing {n} rows')
        _iters = ROUNDS[experiment_index]
        ihs_sketch_size = SKETCH_SIZE
        classic_sketch_size = CLASSICAL_SKETCH_SIZE[experiment_index]
        for trial in range(NTRIALS):
            print("TRIAL {}".format(trial))
            X, y, x_true = gaussian_design_unconstrained(n, D, variance=1.0)
            # rcond=None matches the tests in this file and avoids the
            # FutureWarning raised by the legacy default.
            x_opt = np.linalg.lstsq(X, y, rcond=None)[0]
            for sketch_method in METHODS:
                print('*' * 80)
                col_sparsity = 4 if sketch_method == 'sjlt' else 1
                if sketch_method == 'Sketch & Solve':
                    _sketch = rp(X, classic_sketch_size, 'countSketch',
                                 col_sparsity)
                    SA, Sb = _sketch.sketch_data_targets(y)
                    x_ss = np.linalg.lstsq(SA, Sb, rcond=None)[0]
                    _record(sketch_method, experiment_index, X, x_opt,
                            x_true, x_ss)
                elif sketch_method in sketches:
                    print(f'{sketch_method} IHS')
                    my_ihs = ihs(X, y, sketch_method, ihs_sketch_size,
                                 col_sparsity)
                    x_ihs, x_iters = my_ihs.ols_fit_new_sketch_track_errors(
                        _iters)
                    x_errors = x_opt[:, None] - x_iters
                    print(x_errors.shape)
                    _record(sketch_method, experiment_index, X, x_opt,
                            x_true, x_ihs)
                else:
                    # Solve exactly: reuse x_opt computed above.
                    MSE_TRUTH["Exact"][experiment_index] += mean_square_error(
                        x_opt, x_true)
                    PRED_ERROR_TRUTH["Exact"][
                        experiment_index] += prediction_error(
                            X, x_opt, x_true)

    # Convert accumulated sums into means over the trials.
    for results in [MSE_OPT, PRED_ERROR_OPT, MSE_TRUTH, PRED_ERROR_TRUTH]:
        for key in results:
            results[key] /= NTRIALS

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(MSE_OPT)
    pretty.pprint(PRED_ERROR_OPT)
    pretty.pprint(MSE_TRUTH)
    pretty.pprint(PRED_ERROR_TRUTH)

    save_dir = '../../output/baselines/'
    np.save(save_dir + 'ihs_ols_mse_OPT', MSE_OPT)
    np.save(save_dir + 'ihs_ols_pred_error_OPT', PRED_ERROR_OPT)
    np.save(save_dir + 'ihs_ols_mse_TRUTH', MSE_TRUTH)
    np.save(save_dir + 'ihs_ols_pred_error_TRUTH', PRED_ERROR_TRUTH)