Esempio n. 1
0
def sjlt_error_vs_iterations():
    """Measure IHS error per iteration as the sketch column sparsity varies.

    A single Gaussian design problem is generated.  For every column sparsity
    and every entry of ``sketches``, ``NTRIALS`` runs of
    ``ols_fit_new_sketch_track_errors`` are executed and the prediction error
    of each iterate is accumulated, both against the least squares solution
    and against the planted ``x_star``.  The trial averages are stored keyed
    by column sparsity, pretty-printed and written to disk with ``np.save``.
    """
    num_rows, num_cols = 6_000, 200
    gamma_vals = [5]  # sketch size expressed as a multiple of num_cols
    sketch_size = int(gamma_vals[0] * num_cols)
    col_sparsities = [1, 4, 16]
    number_iterations = 20

    # Output dictionaries, keyed by column sparsity.
    error_to_lsq = {sparsity: [] for sparsity in col_sparsities}
    error_to_truth = {sparsity: [] for sparsity in col_sparsities}
    print(error_to_lsq)
    print(error_to_truth)

    X, y, x_star = gaussian_design_unconstrained(num_rows,
                                                 num_cols,
                                                 variance=1.0)

    # Least squares estimator: the target the iterates are compared against.
    x_opt = np.linalg.lstsq(X, y)[0]
    # Log error of the exact solution against the truth; kept for reference,
    # it is not used further down.
    lsq_vs_truth_errors = np.log(np.sqrt(prediction_error(X, x_opt, x_star)))

    for col_sparsity in col_sparsities:
        print("Testing col sparsity: {}, num_iterations: {}".format(
            col_sparsity, number_iterations))
        for sketch_method in sketches:
            # Running sums over trials, one slot per iteration.
            lsq_totals = np.zeros((number_iterations, ))
            truth_totals = np.zeros_like(lsq_totals)

            solver = ihs(X, y, sketch_method, sketch_size, col_sparsity)
            for trial in range(NTRIALS):
                print('*' * 80)
                print("{}, trial: {}".format(sketch_method, trial))
                _, x_iters = solver.ols_fit_new_sketch_track_errors(
                    number_iterations)
                for it in range(x_iters.shape[1]):
                    iterate = x_iters[:, it]
                    lsq_totals[it] += prediction_error(X, iterate, x_opt)
                    truth_totals[it] += prediction_error(X, iterate, x_star)
                print(lsq_totals)

            lsq_average = lsq_totals / NTRIALS
            truth_average = truth_totals / NTRIALS
            print(lsq_average)
            # The entry for this sparsity is replaced on every pass of the
            # sketch loop, so the last method in `sketches` is what remains.
            error_to_lsq[col_sparsity] = lsq_average
            error_to_truth[col_sparsity] = truth_average

    printer = PrettyPrinter(indent=4)
    printer.pprint(error_to_lsq)
    printer.pprint(error_to_truth)

    # Save the dictionaries
    save_dir = '../../output/ihs_baselines//'
    np.save(save_dir + 'sjlt_error_sparsity_opt', error_to_lsq)
    np.save(save_dir + 'sjlt_error_sparsity_truth', error_to_truth)
def error_vs_dimensionality():
    """Compare estimation error against the truth as the dimension grows.

    For each ``d`` in ``2**4 .. 2**8`` (with ``n = 100 * d`` rows) and each
    method in ``sketches + ['Exact', 'Sketch & Solve']``, ``NTRIALS`` fresh
    Gaussian design problems are generated and solved:

    * ``'Exact'`` -- ordinary least squares on the full data;
    * ``'Sketch & Solve'`` -- least squares on a single ``countSketch`` of
      size ``sampling_rate * num_iterations * d``;
    * anything else -- IHS with that sketch, sketch size
      ``sampling_rate * d`` and ``num_iterations`` iterations.

    The square root of ``prediction_error(X, x_star, x_hat)`` is averaged
    over trials.  The resulting dictionary (plus a ``'Dimensions'`` entry) is
    pretty-printed and written with ``np.save``.
    """
    dimension = [2**i for i in range(4, 9)]
    METHODS = sketches + ['Exact', 'Sketch & Solve']

    # Output dictionary: method --> dimension --> accumulated error.
    error_to_truth = {method: {d: 0 for d in dimension} for method in METHODS}
    print(error_to_truth)

    sampling_rate = 10
    num_iterations = 5

    for d in dimension:
        n = 100 * d
        print(f'TESTING {n},{d}')
        for method in METHODS:
            col_sparsity = 4 if method == 'sjlt' else 1
            for trial in range(NTRIALS):
                # Generate the data
                X, y, x_star = gaussian_design_unconstrained(n, d, 1.0)
                # Strings must be compared with ``==``: ``is`` tests object
                # identity and only matched by accident of constant sharing.
                if method == "Exact":
                    print('Exact method.')
                    x_hat = np.linalg.lstsq(X, y)[0]

                elif method == "Sketch & Solve":
                    # One large sketch whose size matches the total number
                    # of sketched rows used across all IHS iterations.
                    sketch_size = sampling_rate * num_iterations * d
                    print(f"S&S with {sketch_size} sketch size")
                    _sketch = rp(X, sketch_size, 'countSketch', col_sparsity)
                    SA, Sb = _sketch.sketch_data_targets(y)
                    x_hat = np.linalg.lstsq(SA, Sb)[0]
                else:
                    sketch_size = sampling_rate * d
                    print(
                        f"Using {num_iterations} iterations, sketch_size {sketch_size} and {method}"
                    )
                    my_ihs = ihs(X, y, method, sketch_size, col_sparsity)
                    x_hat = my_ihs.ols_fit_new_sketch(num_iterations)

                error = (prediction_error(X, x_star, x_hat))**(0.5)
                error_to_truth[method][d] += error

    # Turn the accumulated sums into means over the trials.
    for method in METHODS:
        for d in dimension:
            error_to_truth[method][d] /= NTRIALS
    error_to_truth['Dimensions'] = dimension
    pretty = PrettyPrinter(indent=4)
    pretty.pprint(error_to_truth)
    save_dir = '../../output/ihs_baselines/'
    np.save(save_dir + 'error_vs_dims', error_to_truth)
def error_vs_time(n, d, sampling_factors, trials, times2test,
                  sklearn_lasso_bound):
    '''Show that a random lasso instance is approximated by the
    hessian sketching scheme.

    A lasso instance is drawn with ``my_lasso_data(n, d)`` and solved once
    through ``sklearn_wrapper`` to obtain the reference solution.  Then, for
    every sketch in ``sketches``, every sampling factor and every time value,
    ``single_exp`` is run ``trials`` times in parallel threads and the
    errors against the reference solution are averaged.

    Parameters
    ----------
    n, d :
        Passed to ``my_lasso_data``; ``d`` also scales the sketch size
        (``int(gamma * d)``) and ``n`` normalises the solution error.
    sampling_factors :
        Iterable of ``gamma`` values.
    trials :
        Number of repetitions per configuration (also forwarded to
        ``sklearn_wrapper``).
    times2test :
        Iterable of time values forwarded to ``single_exp`` (the log message
        refers to them as seconds).
    sklearn_lasso_bound :
        Forwarded unchanged to ``sklearn_wrapper`` and ``single_exp``.

    Returns
    -------
    None.  The nested results dictionary is pretty-printed and saved with
    ``np.save`` after each sketch method has been processed.
    '''
    print(80 * "-")
    print("TESTING LASSO ITERATIVE HESSIAN SKETCH ALGORITHM")

    print("Generating  data")
    X, y, x_star = my_lasso_data(n, d)

    ### Test Sklearn implementation
    print("Beginning test")
    x_opt, f_opt, sklearn_time = sklearn_wrapper(X, y, n, d,
                                                 sklearn_lasso_bound, trials)
    print("LASSO-skl time: {}".format(sklearn_time))
    # ground Truths
    sklearn_error2truth = prediction_error(X, x_opt, x_star)
    time_results = {
        "Sklearn": {
            "error to truth": sklearn_error2truth,
            "objective": f_opt,
            "solve time": sklearn_time
        },
    }

    for sketch in sketches:
        time_results[sketch] = {gamma: {} for gamma in sampling_factors}

    for sketch_method in sketches:

        for gamma in sampling_factors:
            # ``np.int`` was removed from NumPy; the builtin is equivalent.
            sketch_size = int(gamma * d)

            # log10 of the latest mean error; used to skip further runs once
            # the error has reached roughly 1e-16.
            euclidean_error_for_iter_check = 1.0
            for time_ in times2test:
                print("-" * 80)
                print("Testing time: {}".format(time_))
                print("int-log-error: {}".format(
                    int(euclidean_error_for_iter_check)))
                if int(euclidean_error_for_iter_check) <= -16:
                    # continuing for longer doesn't gain anything so just use
                    # previous results.  The totals below always exist here
                    # because the check starts at 1.0, which forces the first
                    # time value through the ``else`` branch.
                    time_results[sketch_method][gamma][time_] = {
                        "error to opt": total_error2opt,
                        "solution error": total_sol_error,
                        "num iterations": total_iters_used
                    }
                    print(
                        "Already converged before time {} seconds so continuing."
                        .format(time_))
                else:
                    total_error2opt = 0
                    total_sol_error = 0
                    total_iters_used = 0
                    print("IHS-LASSO ALGORITHM on ({},{}) WITH {}, gamma {}".
                          format(n, d, sketch_method, gamma))
                    run_trial = delayed(single_exp)
                    results = Parallel(n_jobs=-1, prefer="threads")(
                        run_trial(_trial, n, d, X, y, sketch_size,
                                  sketch_method, time_, sklearn_lasso_bound)
                        for _trial in range(trials))
                    for result in results:
                        # Each result holds the solution first and the
                        # iteration count second.
                        x_ihs = result[0]
                        total_iters_used += result[1]

                        # Update dict output values
                        error2opt = prediction_error(X, x_opt, x_ihs)**2
                        euclidean_error = (1 / n) * np.linalg.norm(x_ihs -
                                                                   x_opt)**2

                        # Update counts
                        total_error2opt += error2opt
                        total_sol_error += euclidean_error

                    total_error2opt /= trials
                    total_sol_error /= trials
                    total_iters_used /= trials
                    print("Mean log||x^* - x'||_A^2: {}".format(
                        np.log10(total_error2opt)))
                    print("Mean log||x^* - x'||^2: {}".format(total_sol_error))
                    print("Mean number of {} iterations used".format(
                        total_iters_used))
                    time_results[sketch_method][gamma][time_] = {
                        "error to opt": total_error2opt,
                        "solution error": total_sol_error,
                        "num iterations": total_iters_used
                    }
                    # Bookkeeping - if the error is at 10E-16 don't do another iteration.
                    euclidean_error_for_iter_check = np.log10(total_error2opt)
                    print("New sol_error_iters: {}".format(
                        euclidean_error_for_iter_check))
        # NOTE(review): this sits inside the sketch loop, so the results are
        # printed and the file rewritten after every sketch method --
        # presumably as a checkpoint; confirm before moving it.
        pretty = PrettyPrinter(indent=4)
        pretty.pprint(time_results)
        file_name = '../../output/ihs_timings/ihs_time_synthetic' + str(
            n) + '_' + str(d) + '.npy'
        np.save(file_name, time_results)
def error_vs_iterations():
    """Plot per-iteration error of single-sketch IHS for several step sizes.

    One Gaussian design problem is generated and solved exactly.  For each
    sketch in ``sketches`` and each step size in ``STEPSIZE``,
    ``ols_fit_one_sketch_track_errors`` is run ``NTRIALS`` times and the
    prediction error of every iterate against the least squares solution is
    averaged over trials.  The averages are pretty-printed and drawn on a
    log-scaled matplotlib figure, one curve per step size.
    """
    n = 6_000
    d = 200
    gamma_vals = [5]
    number_iterations = 30

    # Output dictionary indexed by:
    # sketch method (sketches) --> sketch size (gamma_vals) --> STEPSIZE
    error_to_lsq = {
        sketch_name: {
            gamma: {step: [] for step in STEPSIZE}
            for gamma in gamma_vals
        }
        for sketch_name in sketches
    }

    X, y, x_star = gaussian_design_unconstrained(n, d, variance=1.0)

    # Least squares estimator
    x_opt = np.linalg.lstsq(X, y)[0]
    print('-' * 80)
    print("Beginning test")
    lsq_vs_truth_errors = np.log(np.sqrt(prediction_error(X, x_opt, x_star)))
    print(lsq_vs_truth_errors)

    for gamma in gamma_vals:
        sketch_size = int(gamma * d)
        print("Testing gamma: {}, num_iterations: {}".format(
            gamma, number_iterations))
        for sketch_method in sketches:
            col_sparsity = 4 if sketch_method == 'sjlt' else 1

            my_ihs = ihs(X, y, sketch_method, sketch_size, col_sparsity)
            for step in STEPSIZE:
                # Running sum over trials, one slot per iteration.
                lsq_error = np.zeros((number_iterations, ))
                for trial in range(NTRIALS):
                    print('*' * 80)
                    print("{}, trial: {}".format(sketch_method, trial))
                    print('Step size: ', step)
                    x_ihs, x_iters = my_ihs.ols_fit_one_sketch_track_errors(
                        number_iterations, step)
                    for it in range(x_iters.shape[1]):
                        residual = prediction_error(X, x_iters[:, it], x_opt)
                        # The number logged here is the iteration index.
                        print('Trial {}, residual {}'.format(it, residual))
                        lsq_error[it] += residual

                    # Sketching Error for this step size.
                    frob_error = my_ihs.frob_error
                    spec_error = my_ihs.spectral_error
                    print('Frobenius error: ', frob_error)
                    print('Spectral error: ', spec_error)
                mean_lsq_error = lsq_error / NTRIALS
                error_to_lsq[sketch_method][gamma][step] = mean_lsq_error
    pretty = PrettyPrinter(indent=4)
    pretty.pprint(error_to_lsq)

    ### PLOTTING ###
    my_markers = ['.', 's', '^', 'D', '*', 'h']
    my_colours = ['C0', 'C1', 'C2', 'C3', 'C4', 'C5']
    fig, ax = plt.subplots()
    x_vals = range(1, number_iterations + 1)
    for gamma in gamma_vals:
        for sketch_method in sketches:
            for i, step in enumerate(STEPSIZE):
                # Cycle through the styles so that more step sizes than
                # styles does not raise an IndexError.
                _marker = my_markers[i % len(my_markers)]
                _colour = my_colours[i % len(my_colours)]
                residual = error_to_lsq[sketch_method][gamma][step]
                ax.plot(x_vals,
                        residual,
                        label=step,
                        marker=_marker,
                        color=_colour)
    ax.set_yscale('log')
    ax.set_xticks(x_vals[1::2])
    ax.set_xlabel("Iterations")
    # Raw string: in a normal literal ``\t`` is a TAB character and ``\|`` an
    # invalid escape, which corrupted the rendered label.
    ax.set_ylabel(r'$\| x^t - x_{\mathrm{opt}}\|_A^2$')
    ax.legend(title='Step sizes'
              )  # nb this only makes sense for one sketch dimension
    # The title names the first sketch and the last gamma only.
    ax.set_title('{}, m={}d, step size varied'.format(sketches[0],
                                                      gamma_vals[-1]))
    plt.show()
def error_vs_time_real_data(data_name,X,y,penalty,sampling_factors,trials,times,x_opt):
    '''Show that a lasso instance on a supplied dataset is approximated by
    the hessian sketching scheme.

    For every sketch in ``sketches``, every sampling factor and every time
    value, ``lasso_fit_new_sketch_timing`` is run ``trials`` times and the
    errors against ``x_opt`` are summarised by their median.

    Parameters
    ----------
    data_name :
        Label used in the log output only.
    X, y :
        Data matrix and targets handed to ``ihs``; ``X.shape`` provides
        ``(n, d)``.
    penalty :
        Forwarded to ``lasso_fit_new_sketch_timing``.
    sampling_factors :
        Iterable of ``gamma`` values; the sketch size is ``int(gamma * d)``.
    trials :
        Number of repetitions per configuration.
    times :
        Iterable of time values forwarded to ``lasso_fit_new_sketch_timing``
        (the log message refers to them as seconds).
    x_opt :
        Reference solution the IHS output is compared against.

    Returns
    -------
    dict
        ``sketch --> gamma --> time --> {"error to opt", "solution error",
        "num iterations"}``.
    '''

    # Experimental setup
    print(80*"-")
    print("Testing dataset: {}".format(data_name))
    print("TESTING LASSO ITERATIVE HESSIAN SKETCH ALGORITHM")
    n,d = X.shape
    # Print a single yes/no answer rather than an elementwise comparison.
    print("Is x_OPT all zeros? {}".format(not np.any(x_opt)))
    time_results = {}

    for sketch in sketches:
        time_results[sketch] = {gamma: {} for gamma in sampling_factors}

    for sketch_method in sketches:
        for gamma in sampling_factors:
            # ``np.int`` was removed from NumPy; the builtin is equivalent.
            sketch_size = int(gamma*d)

            # log10 of the latest summary error; used to skip further runs
            # once the error has reached roughly 1e-16.
            solution_error_for_iter_check = 1.0

            for time_ in times:
                print("-"*80)
                print("Testing time: {}".format(time_))
                print("int-log-error: {}".format(int(solution_error_for_iter_check)))
                if int(solution_error_for_iter_check) <= -16:
                    # continuing for longer doesn't gain anything so just use
                    # previous results.  The totals always exist here because
                    # the check starts at 1.0, which forces the first time
                    # value through the ``else`` branch.
                    time_results[sketch_method][gamma][time_] = {"error to opt" : total_error2opt,
                                                         "solution error" : total_sol_error,
                                                         "num iterations" : total_iters_used}
                    print("Already converged before time {} seconds so continuing.".format(time_))

                else:
                    # Per-trial values, summarised by their median below.
                    total_error2opt       = []
                    total_sol_error       = []
                    total_iters_used      = []

                    print("IHS-LASSO ALGORITHM on ({},{}) WITH {}, gamma {}".format(n,d,sketch_method, gamma))

                    for _trial in range(trials):
                        print("Trial {}".format(_trial))
                        # The shuffled copies of the data that used to be
                        # built here were never read (the fit uses X and y
                        # directly).  The draw itself is kept so the global
                        # NumPy random stream, and hence seeded runs, are
                        # unchanged.
                        np.random.permutation(n)

                        my_ihs = ihs(X,y,sketch_method,sketch_size)
                        x_ihs, iters_used = my_ihs.lasso_fit_new_sketch_timing(penalty,time_)
                        my_prediction_error = prediction_error(X,x_opt,x_ihs)
                        print("Iterations completed: ", iters_used)
                        print("Prediction error: ",my_prediction_error)

                        # Update dict output values
                        error2opt = my_prediction_error
                        solution_error = (1/n)*np.linalg.norm(x_ihs - x_opt)**2
                        print("Trial: {}, Error: {}".format(_trial, error2opt))
                        print("-"*80)
                        # Update counts
                        total_error2opt.append(error2opt)
                        total_sol_error.append(solution_error)
                        total_iters_used.append(iters_used)

                    # The log messages below say "Mean" but the statistic
                    # reported is the median over trials.
                    total_error2opt = np.median(total_error2opt)
                    total_sol_error = np.median(total_sol_error)
                    total_iters_used = np.median(total_iters_used)
                    print("Mean log||x^* - x'||_A^2: {}".format(np.log10(total_error2opt)))
                    print("Mean log||x^* - x'||^2: {}".format(total_sol_error))
                    print("Mean number of {} iterations used".format(total_iters_used))
                    time_results[sketch_method][gamma][time_] = {"error to opt" : total_error2opt,
                                                         "solution error" : total_sol_error,
                                                         "num iterations" : total_iters_used}
                    # Bookkeeping - if the error is at 10E-16 don't do another iteration.
                    solution_error_for_iter_check = np.log10(total_error2opt)
                    print("New sol_error_iters: {}".format(solution_error_for_iter_check))

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(time_results)
    return time_results
Esempio n. 6
0
def solution_error_vs_row_dim():
    '''
    Increase `n` the input dimension of the problem and
    measure the solution error in both:
    (i) Euclidean norm (`mean_square_error`)
    (ii) Prediction norm (`prediction_error`).

    Error measurements are taken with respect to:
    (i) the optimal solution x_opt
    (ii) the ground truth

    For every row count in `ROWDIMS`, `NTRIALS` Gaussian design problems
    are generated and each entry of `METHODS` is applied: IHS for the names
    in `sketches`, a single countSketch for 'Sketch & Solve', and the exact
    least squares solution otherwise.  Errors are averaged over the trials,
    pretty-printed and saved with `np.save`.
    '''
    print('Experimental setup:')
    print(f'IHS sketch size {SKETCH_SIZE}')
    print(f'Sketch and solve sketch size {CLASSICAL_SKETCH_SIZE}')
    print(f'Number of rounds {ROUNDS}')

    num_experiments = len(ROWDIMS)

    # Output dictionaries: method --> array with one slot per row dimension.
    MSE_OPT = {name: np.zeros(num_experiments) for name in sketches}
    PRED_ERROR_OPT = {name: np.zeros(num_experiments) for name in sketches}
    MSE_TRUTH = {name: np.zeros(num_experiments) for name in sketches}
    PRED_ERROR_TRUTH = {name: np.zeros(num_experiments) for name in sketches}
    all_tables = [MSE_OPT, PRED_ERROR_OPT, MSE_TRUTH, PRED_ERROR_TRUTH]

    for _dict in all_tables:
        _dict['Sketch & Solve'] = np.zeros(num_experiments)

    # The exact solution is only compared against the ground truth.
    MSE_TRUTH['Exact'] = np.zeros(num_experiments)
    PRED_ERROR_TRUTH['Exact'] = np.zeros(num_experiments)

    def _record(method, index, X, x_opt, x_true, x_hat):
        # Add one trial's four error measures for `method` at slot `index`.
        MSE_OPT[method][index] += mean_square_error(x_opt, x_hat)
        PRED_ERROR_OPT[method][index] += prediction_error(X, x_opt, x_hat)
        MSE_TRUTH[method][index] += mean_square_error(x_true, x_hat)
        PRED_ERROR_TRUTH[method][index] += prediction_error(X, x_true, x_hat)

    ## Experiment
    # enumerate keeps the slot correct even if a row dimension is repeated,
    # where a value lookup would always return the first occurrence.
    for experiment_index, n in enumerate(ROWDIMS):
        print(f'Testing {n} rows')
        _iters = ROUNDS[experiment_index]
        ihs_sketch_size = SKETCH_SIZE
        classic_sketch_size = CLASSICAL_SKETCH_SIZE[experiment_index]

        for trial in range(NTRIALS):
            print("TRIAL {}".format(trial))
            X, y, x_true = gaussian_design_unconstrained(n, D, variance=1.0)
            x_opt = np.linalg.lstsq(X, y)[0]

            for sketch_method in METHODS:
                print('*' * 80)
                if sketch_method == 'Sketch & Solve':
                    # Classical sketching: solve one sketched problem.
                    _sketch = rp(X, classic_sketch_size, 'countSketch', 1)
                    SA, Sb = _sketch.sketch_data_targets(y)
                    x_ss = np.linalg.lstsq(SA, Sb)[0]
                    _record(sketch_method, experiment_index, X, x_opt,
                            x_true, x_ss)
                elif sketch_method in sketches:
                    col_sparsity = 4 if sketch_method == 'sjlt' else 1
                    print(f'{sketch_method} IHS')
                    my_ihs = ihs(X, y, sketch_method, ihs_sketch_size,
                                 col_sparsity)
                    x_ihs, x_iters = my_ihs.ols_fit_new_sketch_track_errors(
                        _iters)
                    # Only the shape of the per-iterate errors is reported.
                    x_errors = x_opt[:, None] - x_iters
                    print(x_errors.shape)
                    _record(sketch_method, experiment_index, X, x_opt,
                            x_true, x_ihs)
                else:
                    # solve exactly: x_opt was already computed above.
                    MSE_TRUTH["Exact"][experiment_index] += mean_square_error(
                        x_opt, x_true)
                    PRED_ERROR_TRUTH["Exact"][
                        experiment_index] += prediction_error(
                            X, x_opt, x_true)

    # Turn the accumulated sums into means over the trials.
    for _dict in all_tables:
        for _key in _dict:
            _dict[_key] /= NTRIALS

    pretty = PrettyPrinter(indent=4)
    pretty.pprint(MSE_OPT)
    pretty.pprint(PRED_ERROR_OPT)
    pretty.pprint(MSE_TRUTH)
    pretty.pprint(PRED_ERROR_TRUTH)

    save_dir = '../../output/baselines/'
    np.save(save_dir + 'ihs_ols_mse_OPT', MSE_OPT)
    np.save(save_dir + 'ihs_ols_pred_error_OPT', PRED_ERROR_OPT)
    np.save(save_dir + 'ihs_ols_mse_TRUTH', MSE_TRUTH)
    np.save(save_dir + 'ihs_ols_pred_error_TRUTH', PRED_ERROR_TRUTH)