Example no. 1
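These snippets assume imports along the lines below (inferred from how the names are used). `utils`, `custom_paths`, and classes such as SimpleParallelTrainer, RadialDataDistribution, or ModelTrainer are project-local, and their exact import paths are not shown in the source:

# Assumed imports, inferred from usage; project-local modules are not shown in the source.
import datetime
import sys
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn

import custom_paths
import utils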
def run_finer_lrs(init_param='kaiming', device='cpu'):
    # Refine the lr search: for each configuration, take the best lr found so far
    # (saved by save_best_lrs) and try a finer grid of lrs around it.
    dist_grid = [ExampleDistribution()] + [RadialDataDistribution(d=2**k)
                                           for k in range(7)]
    std_grid = [0.1, 0.5, 1.0, 2.0]
    # bi_grid = [('zero', 0.0), ('he+5', 0.0), ('he+1', 0.0), ('kink_uniform', 0.0)] \
    #             + [(bim, big) for big in std_grid for bim in ['normal', 'uniform']] \
    #             + [('pos-unif', 1.0), ('neg-unif', 1.0), ('kink-unif', 1.0), ('kink-neg-unif', 1.0),
    #                ('kink-neg-point', 0.0)]
    bi_grid = [('zero', 0.0), ('unif', 1.0), ('unif-pos', 1.0),
               ('unif-neg', 1.0), ('kink-neg-unif', 1.0), ('pytorch', 1.0),
               ('kink-neg-point', 0.0)]
    for opt in ['gd', 'gd-mom', 'adam']:
        for dist in dist_grid:
            d = dist.get_x_dim()
            for bim, big in bi_grid:
                folder_name = f'{init_param}_{opt}_{dist.get_name()}_{bim}-{big:g}'
                results_path = Path(custom_paths.get_results_path()) / 'nn_comparison'
                path = results_path / folder_name
                best_lr_file = results_path / f'{folder_name}_bestlr.pkl'
                if not utils.existsFile(best_lr_file):
                    sys.stderr.write(
                        f'best lr file {best_lr_file} does not exist!\n')
                    continue
                best_lr = utils.deserialize(best_lr_file)
                lr_grid = [best_lr * (2**(k / 8)) for k in range(-3, 4)]
                for lr in lr_grid:
                    print(f'Running combination {folder_name} with lr {lr:g}')
                    file = path / f'{lr:g}.pkl'
                    utils.ensureDir(file)
                    if utils.existsFile(file):
                        continue
                    n_rep = 2 if d == 64 else 1
                    trainer = SimpleParallelTrainer(n_parallel=100 // n_rep,
                                                    n_train=256 * d,
                                                    n_valid=1024,
                                                    n_test=1024,
                                                    data_distribution=dist,
                                                    lr=lr,
                                                    bias_init_gain=big,
                                                    batch_size=256,
                                                    bias_init_mode=bim,
                                                    init_param=init_param,
                                                    n_epochs=8192 // d,
                                                    seed=0,
                                                    device=device,
                                                    n_hidden=512,
                                                    opt=opt,
                                                    valid_epoch_interval=64 // d,
                                                    n_rep=n_rep)
                    results = trainer.fit(do_plot=False, verbose=False)
                    if results is None:
                        print('Got NaN values')
                    utils.serialize(file, {
                        'trainer': trainer,
                        'results': results
                    })
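Since 2**(3/8) ≈ 1.30, the finer grid covers roughly ±30% around best_lr in seven logarithmically spaced steps. A standalone illustration of the spacing (the best_lr value here is made up):

best_lr = 1e-2  # hypothetical value, for illustration only
lr_grid = [best_lr * (2 ** (k / 8)) for k in range(-3, 4)]
print([f'{lr:g}' for lr in lr_grid])
# ['0.00771105', '0.00840896', '0.00917004', '0.01', '0.0109051', '0.0118921', '0.0129684']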
def run_training(n_hidden=256, ds_type='2d_star_11', n_parallel=1000, n_epochs=1000000,
                 random_bias=False, act='relu', n_layers=1, device_number=0, version=0):
    print('Start time:', datetime.datetime.now())

    if n_layers > 1:
        name = f'{n_hidden}x{n_layers}-{n_parallel}-{random_bias}-{act}-v{version}'
    else:
        name = f'{n_hidden}-{n_parallel}-{random_bias}-{act}-v{version}'

    # Note: the training data is hardcoded to the 2d star dataset regardless of ds_type.
    x_train, y_train = get_2d_star_dataset(k=11, dist=0.1)

    print(f'Running model for {n_epochs} epochs on dataset {ds_type}: {name}')
    base_dir = Path(get_results_path())
    file_dir = base_dir / ds_type / name
    file_path = file_dir / 'model_trainer.p'
    if utils.existsFile(file_path):
        print('Loading existing model')
        mt = utils.deserialize(file_path)
        mt.to(get_device(device_number))
    else:
        print('Creating new model')
        mt = ModelTrainer(x_train, y_train, n_parallel=n_parallel,
                          hidden_sizes=[n_hidden] * n_layers, n_virtual_samples=n_hidden**2,
                          random_bias=random_bias, act=act, device_number=device_number, version=version)
    mt.train(n_epochs)
    mt.to('cpu')
    utils.ensureDir(file_path)
    utils.serialize(file_path, mt)
    utils.serialize(file_dir / 'config.p',
                    dict(ds_type=ds_type, n_parallel=n_parallel, n_layers=n_layers,
                         random_bias=random_bias, act=act, n_epochs=n_epochs,
                         version=version))
    print('Saved trained model')
    print('End time:', datetime.datetime.now())
def compute_dd_results(name, sampler, n_rep=10, n_parallel=1000, **kwargs):
    # Computes the results over multiple repetitions and saves them,
    # skipping repetitions whose results have already been computed.
    for rep in range(n_rep):
        print(f'Repetition {rep+1}/{n_rep}')
        filename = Path('data/double_descent') / name / f'v{rep}_{n_parallel}.p'
        if utils.existsFile(filename):
            print('Results have already been computed')
            continue
        results = DoubleDescentResults(**kwargs,
                                       random_seed=rep,
                                       n_parallel=n_parallel)
        results.compute(sampler)
        utils.ensureDir(filename)
        utils.serialize(filename, results)
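A plausible way to call compute_dd_results, purely for illustration (the name and all argument values are hypothetical; the extra keyword arguments are forwarded to DoubleDescentResults, whose signature is not shown here, and the sampler could come from train_best_feature_map below):

# Hypothetical usage; 'relu_fm' and all argument values are illustrative placeholders.
sampler = train_best_feature_map('relu_fm', layer_sizes=[16, 256, 64],
                                 n=32, act=torch.relu)
compute_dd_results('relu_fm', sampler, n_rep=10, n_parallel=1000)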
Example no. 4
def run_old(init_param='kaiming', device='cpu'):
    dist_grid = [ExampleDistribution()] + [RBFDataDistribution(d=2**k)
                                           for k in range(7)]
    std_grid = [0.1, 0.5, 1.0, 2.0]
    bi_grid = [('zero', 0.0), ('he+5', 0.0), ('he+1', 0.0), ('kink_uniform', 0.0)] \
                + [(bim, big) for big in std_grid for bim in ['normal', 'uniform']]
    for opt in ['gd', 'gd-mom', 'adam']:
        base_lr = 1e-2 if opt == 'adam' else (
            4e-1 if init_param == 'ntk' else 8e-3)
        lr_grid = [base_lr * np.sqrt(2)**k for k in range(-8, 9)]
        for dist in dist_grid:
            for bim, big in bi_grid:
                folder_name = f'{init_param}_{opt}_{dist.get_name()}_{bim}-{big:g}'
                results_path = Path(custom_paths.get_results_path()) / 'nn_comparison'
                path = results_path / folder_name
                for lr in lr_grid:
                    print(f'Running combination {folder_name} with lr {lr:g}')
                    file = path / f'{lr:g}.pkl'
                    utils.ensureDir(file)
                    if utils.existsFile(file):
                        continue
                    torch.cuda.empty_cache()
                    trainer = SimpleParallelTrainer(n_parallel=100,
                                                    n_train=256,
                                                    n_valid=1024,
                                                    n_test=1024,
                                                    data_distribution=dist,
                                                    lr=lr,
                                                    bias_init_gain=big,
                                                    bias_init_mode=bim,
                                                    init_param=init_param,
                                                    n_epochs=10000,
                                                    seed=0,
                                                    device=device,
                                                    n_hidden=256,
                                                    opt=opt)
                    results = trainer.fit(do_plot=False, verbose=False)
                    if results is None:
                        print('Got NaN values')
                    utils.serialize(file, {
                        'trainer': trainer,
                        'results': results
                    })
Example no. 5
def save_best_lrs():
    base_path = Path(custom_paths.get_results_path()) / 'nn_comparison'

    for results_dir in base_path.iterdir():
        if not results_dir.is_dir():
            continue
        bestlr_filename = base_path / f'{results_dir.name}_bestlr.pkl'
        if utils.existsFile(bestlr_filename):
            # Already computed; don't recompute, since results from run_finer_lrs
            # may now be present and would change best_lr.
            continue
        valid_dir_results = []
        for results_file in results_dir.iterdir():
            results = utils.deserialize(results_file)
            if results['results'] is not None:
                valid_dir_results.append(results)

        if len(valid_dir_results) > 0:
            best_idx = np.argmin(
                [r['results']['best_valid_rmse'] for r in valid_dir_results])
            best_lr = valid_dir_results[best_idx]['trainer'].lr
            print(f'Best lr for {results_dir.name}: {best_lr:g}')
            utils.serialize(bestlr_filename, best_lr)
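Taken together, these functions form a coarse-to-fine learning-rate sweep. The order below is inferred from the file dependencies (run_old fills the per-configuration result folders, save_best_lrs distills each folder into a *_bestlr.pkl file, and run_finer_lrs reads those files); it is a sketch, not spelled out in the source:

# Inferred pipeline order, based on which files each step reads and writes:
run_old(init_param='kaiming', device='cpu')        # coarse lr grid -> <folder>/<lr>.pkl
save_best_lrs()                                    # -> <folder>_bestlr.pkl per configuration
run_finer_lrs(init_param='kaiming', device='cpu')  # finer grid around each best_lr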
def train_best_feature_map(name,
                           layer_sizes,
                           n,
                           act,
                           n_iterations=1000,
                           n_mc=1000,
                           batch_size=1024,
                           last_layer_act=True):
    # Trains a feature map to minimize E_noise for the given value of n; the Monte
    # Carlo objective below is the batch mean of tr(Sigma @ Z_pinv @ Z_pinv^T).
    torch.manual_seed(0)
    device = get_default_device()
    # Empirical std of act(z) for z ~ N(0, 1), used as a per-layer scale factor.
    weight_factor = act(torch.randn(10000, dtype=torch.float64,
                                    device=device)).std().item()
    weight_factors = [1.0] + [weight_factor] * len(layer_sizes[1:-1])
    x_sampler = NormalXSampler(dim=layer_sizes[0])
    acts = [act] * (len(layer_sizes) - 2) + [act if last_layer_act else identity]
    model = nn.Sequential(*[
        WeightActLayer(d_in, d_out, act_fn, weight_factor, use_bias=True)
        for d_in, d_out, weight_factor, act_fn in zip(
            layer_sizes[:-1], layer_sizes[1:], weight_factors, acts)
    ])
    filename = Path('models') / name / 'model.p'
    if utils.existsFile(filename):
        print('Loading serialized model')
        model.load_state_dict(utils.deserialize(filename))
        model = model.to(device)
    else:
        model = model.to(device)
        max_lr = 1e-3
        opt = torch.optim.Adam(model.parameters(),
                               lr=max_lr,
                               betas=(0.9, 0.999),
                               amsgrad=True)
        # Small ridge term to keep the batched matrix inverses numerically stable.
        lam = 1e-12
        for i in range(n_iterations):
            print(f'Iteration {i+1}/{n_iterations}')
            # Linearly decay the lr from max_lr toward zero over the iterations.
            for group in opt.param_groups:
                group['lr'] = max_lr * (1 - i / n_iterations)
            # Monte Carlo estimate of the (uncentered) feature covariance.
            x_cov = x_sampler.sample(n_mc)
            z_cov = model(x_cov)
            Sigma = z_cov.t().matmul(z_cov) / n_mc
            # Sample batch_size design matrices Z, each of shape (n, n_features).
            Z = model(x_sampler.sample(n * batch_size)).view(
                batch_size, n, layer_sizes[-1])
            if n < layer_sizes[-1]:  # overparameterized: invert the n x n Gram matrices
                # Build the identity on Z's device/dtype so this also runs on GPU.
                eye = torch.eye(Z.shape[1], device=Z.device, dtype=Z.dtype)
                X_pinv = Z.transpose(1, 2).bmm(
                    (Z.bmm(Z.transpose(1, 2)) + lam * eye[None, :, :]).inverse())
            else:  # underparameterized: invert the p x p covariance matrices
                eye = torch.eye(Z.shape[2], device=Z.device, dtype=Z.dtype)
                X_pinv = (Z.transpose(1, 2).bmm(Z) +
                          lam * eye[None, :, :]).inverse().bmm(Z.transpose(1, 2))
            prod = X_pinv.bmm(X_pinv.transpose(1, 2))
            # Objective: batch mean of tr(Sigma @ X_pinv @ X_pinv^T).
            mean_trace = (Sigma[None, :, :] * prod).sum(dim=2).sum(dim=1).mean()
            print('Mean trace:', mean_trace.item())
            mean_trace.backward()
            opt.step()
            opt.zero_grad()
        # Make sure the target directory exists before saving, as the other runners do.
        utils.ensureDir(filename)
        utils.serialize(filename, model.state_dict())
    return FixedFeatureMapSampler(x_sampler,
                                  model,
                                  dim=layer_sizes[-1],
                                  no_grad=True)
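Both branches compute the same ridge-regularized pseudo-inverse, via the push-through identity Z^T (Z Z^T + lam*I)^{-1} = (Z^T Z + lam*I)^{-1} Z^T; the branch is chosen so that the inverted matrix is the smaller of n x n and p x p. A minimal standalone check of the identity (the shapes are made up, and lam is chosen larger than in the code above so the demo stays well conditioned):

import torch

torch.manual_seed(0)
Z = torch.randn(5, 8, dtype=torch.float64)  # made-up shapes: n=5 samples, p=8 features
lam = 1e-6  # larger ridge than above, purely for a well-conditioned demo
pinv_gram = Z.t() @ torch.inverse(Z @ Z.t() + lam * torch.eye(5, dtype=torch.float64))
pinv_cov = torch.inverse(Z.t() @ Z + lam * torch.eye(8, dtype=torch.float64)) @ Z.t()
print(torch.allclose(pinv_gram, pinv_cov))  # True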