import datetime
import sys
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn

# Project-local imports are assumed to come from the surrounding repository: utils,
# custom_paths, the data distributions (ExampleDistribution, RadialDataDistribution,
# RBFDataDistribution), SimpleParallelTrainer, ModelTrainer, DoubleDescentResults,
# NormalXSampler, WeightActLayer, FixedFeatureMapSampler, identity, get_2d_star_dataset,
# get_results_path, get_device and get_default_device.


def run_finer_lrs(init_param='kaiming', device='cpu'):
    dist_grid = [ExampleDistribution()] + [RadialDataDistribution(d=2**k) for k in range(7)]
    std_grid = [0.1, 0.5, 1.0, 2.0]  # only used by the commented-out bi_grid below
    # bi_grid = [('zero', 0.0), ('he+5', 0.0), ('he+1', 0.0), ('kink_uniform', 0.0)] \
    #           + [(bim, big) for big in std_grid for bim in ['normal', 'uniform']] \
    #           + [('pos-unif', 1.0), ('neg-unif', 1.0), ('kink-unif', 1.0),
    #              ('kink-neg-unif', 1.0), ('kink-neg-point', 0.0)]
    bi_grid = [('zero', 0.0), ('unif', 1.0), ('unif-pos', 1.0), ('unif-neg', 1.0),
               ('kink-neg-unif', 1.0), ('pytorch', 1.0), ('kink-neg-point', 0.0)]
    for opt in ['gd', 'gd-mom', 'adam']:
        for dist in dist_grid:
            d = dist.get_x_dim()
            for bim, big in bi_grid:
                folder_name = f'{init_param}_{opt}_{dist.get_name()}_{bim}-{big:g}'
                path = Path(custom_paths.get_results_path()) / 'nn_comparison' / folder_name
                best_lr_file = Path(custom_paths.get_results_path()) / 'nn_comparison' \
                               / f'{folder_name}_bestlr.pkl'
                if not utils.existsFile(best_lr_file):
                    sys.stderr.write(f'best lr file {best_lr_file} does not exist!\n')
                    continue
                best_lr = utils.deserialize(best_lr_file)
                # refine around the best lr found on the coarse grid
                lr_grid = [best_lr * (2**(k / 8)) for k in range(-3, 4)]
                for lr in lr_grid:
                    print(f'Running combination {folder_name} with lr {lr:g}')
                    file = path / f'{lr:g}.pkl'
                    utils.ensureDir(file)
                    if utils.existsFile(file):
                        continue
                    n_rep = 2 if d == 64 else 1
                    trainer = SimpleParallelTrainer(n_parallel=100 // n_rep, n_train=256 * d,
                                                    n_valid=1024, n_test=1024,
                                                    data_distribution=dist, lr=lr,
                                                    bias_init_gain=big, batch_size=256,
                                                    bias_init_mode=bim, init_param=init_param,
                                                    n_epochs=8192 // d, seed=0, device=device,
                                                    n_hidden=512, opt=opt,
                                                    valid_epoch_interval=64 // d, n_rep=n_rep)
                    results = trainer.fit(do_plot=False, verbose=False)
                    if results is None:
                        print('Got NaN values')
                    utils.serialize(file, {'trainer': trainer, 'results': results})
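# Sanity check of the refinement grid above: the factors 2**(k/8) for k in range(-3, 4)
# step multiplicatively by 2**(1/8) ≈ 1.09, so the seven learning rates span roughly
# 0.77 * best_lr to 1.30 * best_lr around the stored best lr.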
def run_training(n_hidden=256, ds_type='2d_star_11', n_parallel=1000, n_epochs=1000000,
                 random_bias=False, act='relu', n_layers=1, device_number=0, version=0):
    print('Start time:', datetime.datetime.now())
    if n_layers > 1:
        name = f'{n_hidden}x{n_layers}-{n_parallel}-{random_bias}-{act}-v{version}'
    else:
        name = f'{n_hidden}-{n_parallel}-{random_bias}-{act}-v{version}'
    # NOTE: the training data is always the k=11 2d star dataset;
    # ds_type only determines the results subdirectory name.
    x_train, y_train = get_2d_star_dataset(k=11, dist=0.1)
    print(f'Running model for {n_epochs} epochs on dataset {ds_type}: {name}')
    base_dir = Path(get_results_path())
    file_dir = base_dir / ds_type / name
    file_path = file_dir / 'model_trainer.p'
    if utils.existsFile(file_path):
        print('Loading existing model')
        mt = utils.deserialize(file_path)
        mt.to(get_device(device_number))
    else:
        print('Creating new model')
        mt = ModelTrainer(x_train, y_train, n_parallel=n_parallel,
                          hidden_sizes=[n_hidden] * n_layers, n_virtual_samples=n_hidden**2,
                          random_bias=random_bias, act=act, device_number=device_number,
                          version=version)
    mt.train(n_epochs)
    mt.to('cpu')
    utils.serialize(file_path, mt)
    utils.serialize(file_dir / 'config.p',
                    dict(ds_type=ds_type, n_parallel=n_parallel, n_layers=n_layers,
                         random_bias=random_bias, act=act, n_epochs=n_epochs, version=version))
    print('Saved trained model')
    print('End time:', datetime.datetime.now())
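# Example usage (a sketch; the argument values are illustrative, not prescribed):
#     run_training(n_hidden=256, ds_type='2d_star_11', n_parallel=1000,
#                  n_epochs=100000, act='relu', n_layers=1, device_number=0)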
def compute_dd_results(name, sampler, n_rep=10, n_parallel=1000, **kwargs):
    # Computes the results in multiple repetitions and saves them,
    # but only if the results have not already been computed.
    for rep in range(n_rep):
        print(f'Repetition {rep+1}/{n_rep}')
        filename = Path('data/double_descent/') / name / f'v{rep}_{n_parallel}.p'
        if utils.existsFile(filename):
            print('Results have already been computed')
            continue
        results = DoubleDescentResults(**kwargs, random_seed=rep, n_parallel=n_parallel)
        results.compute(sampler)
        utils.serialize(filename, results)
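# Example usage (a sketch; `my_sampler` stands for whatever feature-map sampler the
# experiment uses, e.g. the one returned by train_best_feature_map below; extra keyword
# arguments are forwarded to DoubleDescentResults):
#     compute_dd_results('my_experiment', my_sampler, n_rep=10, n_parallel=1000)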
def run_old(init_param='kaiming', device='cpu'):
    dist_grid = [ExampleDistribution()] + [RBFDataDistribution(d=2**k) for k in range(7)]
    std_grid = [0.1, 0.5, 1.0, 2.0]
    bi_grid = [('zero', 0.0), ('he+5', 0.0), ('he+1', 0.0), ('kink_uniform', 0.0)] \
              + [(bim, big) for big in std_grid for bim in ['normal', 'uniform']]
    for opt in ['gd', 'gd-mom', 'adam']:
        base_lr = 1e-2 if opt == 'adam' else (4e-1 if init_param == 'ntk' else 8e-3)
        lr_grid = [base_lr * np.sqrt(2)**k for k in range(-8, 9)]
        for dist in dist_grid:
            for bim, big in bi_grid:
                folder_name = f'{init_param}_{opt}_{dist.get_name()}_{bim}-{big:g}'
                path = Path(custom_paths.get_results_path()) / 'nn_comparison' / folder_name
                for lr in lr_grid:
                    print(f'Running combination {folder_name} with lr {lr:g}')
                    file = path / f'{lr:g}.pkl'
                    utils.ensureDir(file)
                    if utils.existsFile(file):
                        continue
                    torch.cuda.empty_cache()
                    trainer = SimpleParallelTrainer(n_parallel=100, n_train=256, n_valid=1024,
                                                    n_test=1024, data_distribution=dist, lr=lr,
                                                    bias_init_gain=big, bias_init_mode=bim,
                                                    init_param=init_param, n_epochs=10000,
                                                    seed=0, device=device, n_hidden=256,
                                                    opt=opt)
                    results = trainer.fit(do_plot=False, verbose=False)
                    if results is None:
                        print('Got NaN values')
                    utils.serialize(file, {'trainer': trainer, 'results': results})
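# Sanity check of the coarse grid above: np.sqrt(2)**k for k in range(-8, 9) gives
# 17 learning rates spaced by a factor of sqrt(2) ≈ 1.41, spanning base_lr / 16
# up to 16 * base_lr.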
def save_best_lrs():
    base_path = Path(custom_paths.get_results_path()) / 'nn_comparison'
    for results_dir in base_path.iterdir():
        if not results_dir.is_dir():
            continue
        bestlr_filename = base_path / f'{results_dir.name}_bestlr.pkl'
        if utils.existsFile(bestlr_filename):
            # Has already been computed; don't recompute, since results from run_finer_lrs
            # might be there by now and would change best_lr.
            continue
        valid_dir_results = []
        for results_file in results_dir.iterdir():
            results = utils.deserialize(results_file)
            if results['results'] is not None:
                valid_dir_results.append(results)
        if len(valid_dir_results) > 0:
            best_idx = np.argmin([r['results']['best_valid_rmse']
                                  for r in valid_dir_results])
            best_lr = valid_dir_results[best_idx]['trainer'].lr
            print(best_lr)
            utils.serialize(bestlr_filename, best_lr)
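# Typical pipeline (a sketch inferred from the functions in this module): run_old()
# populates the coarse learning-rate grids, save_best_lrs() stores the lr with the best
# validation RMSE per configuration, and run_finer_lrs() then refines around each stored lr:
#     run_old(init_param='kaiming', device='cuda')
#     save_best_lrs()
#     run_finer_lrs(init_param='kaiming', device='cuda')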
def train_best_feature_map(name, layer_sizes, n, act, n_iterations=1000, n_mc=1000,
                           batch_size=1024, last_layer_act=True):
    # Trains a feature map to minimize E_noise for the given value of n.
    torch.manual_seed(0)
    device = get_default_device()
    # empirical std of the activation under a standard normal input, used to rescale weights
    weight_factor = act(torch.randn(10000, dtype=torch.float64, device=device)).std().item()
    weight_factors = [1.0] + [weight_factor] * len(layer_sizes[1:-1])
    x_sampler = NormalXSampler(dim=layer_sizes[0])
    acts = [act] * (len(layer_sizes) - 2) + [act if last_layer_act else identity]
    model = nn.Sequential(*[
        WeightActLayer(d_in, d_out, act_fn, weight_factor, use_bias=True)
        for (d_in, d_out, weight_factor, act_fn)
        in zip(layer_sizes[:-1], layer_sizes[1:], weight_factors, acts)
    ])
    filename = Path('models') / name / 'model.p'
    if utils.existsFile(filename):
        print('Loading serialized model')
        model.load_state_dict(utils.deserialize(filename))
        model = model.to(device)
    else:
        model = model.to(device)
        max_lr = 1e-3
        opt = torch.optim.Adam(model.parameters(), lr=max_lr, betas=(0.9, 0.999),
                               amsgrad=True)
        lam = 1e-12  # small ridge term for numerical stability of the (pseudo-)inverse
        for i in range(n_iterations):
            print(f'Iteration {i+1}/{n_iterations}')
            # linearly decay the learning rate from max_lr towards zero
            for group in opt.param_groups:
                group['lr'] = max_lr * (1 - i / n_iterations)
            # Monte Carlo estimate Sigma of the feature second-moment matrix
            x_cov = x_sampler.sample(n_mc)
            z_cov = model(x_cov)
            Sigma = z_cov.t().matmul(z_cov) / n_mc
            Z = model(x_sampler.sample(n * batch_size)).view(batch_size, n, layer_sizes[-1])
            if n < layer_sizes[-1]:  # overparameterized case
                X_pinv = Z.transpose(1, 2).bmm(
                    (Z.bmm(Z.transpose(1, 2))
                     + lam * torch.eye(Z.shape[1], dtype=Z.dtype, device=Z.device)[None, :, :]
                     ).inverse())
            else:  # underparameterized case
                X_pinv = (Z.transpose(1, 2).bmm(Z)
                          + lam * torch.eye(Z.shape[2], dtype=Z.dtype, device=Z.device)[None, :, :]
                          ).inverse().bmm(Z.transpose(1, 2))
            prod = X_pinv.bmm(X_pinv.transpose(1, 2))
            mean_trace = (Sigma[None, :, :] * prod).sum(dim=2).sum(dim=1).mean()
            print('Mean trace:', mean_trace.item())
            mean_trace.backward()
            opt.step()
            opt.zero_grad()
        utils.serialize(filename, model.state_dict())
    return FixedFeatureMapSampler(x_sampler, model, dim=layer_sizes[-1], no_grad=True)
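# Example usage (a sketch; the layer sizes and n are illustrative, and torch.relu is one
# valid choice for act):
#     sampler = train_best_feature_map('relu_map', layer_sizes=[16, 256, 64], n=32,
#                                      act=torch.relu, n_iterations=1000)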