def test_lr_decays(model, dataset):
    _opt = optim.SGD(model.parameters(), lr=1)
    # dampingfactor=2, dampingdelay=3: the damping doubles every 3 epochs, and
    # once the batch size is capped at max_batch_size=8 the learning rate decays
    opt = GeoDamp(
        model,
        dataset,
        _opt,
        dwell=1,
        initial_batch_size=4,
        dampingdelay=3,
        dampingfactor=2,
        max_batch_size=8,
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)
    damping_factor = df.damping / 4

    # Batch size and damping only increase; the learning rate only decreases
    assert (np.diff(df.batch_size) >= 0).all()
    assert (np.diff(df.lr_) <= 0).all()
    assert (np.diff(damping_factor) >= 0).all()

    # Make sure they change by the correct amounts
    assert set(damping_factor.unique()) == {1, 2, 4, 8, 16, 32}
    assert set(df.batch_size) == {4, 8}
    assert set(df.lr_) == {1, 1/2, 1/4, 1/8, 1/16}
def test_padadamp(model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = PadaDamp(
        model, dataset, _opt, batch_growth_rate=1, initial_batch_size=4, dwell=1
    )
    data: List[Dict[str, Any]] = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)
    # PadaDamp's damping term should never fall below 1
    assert (df.damping >= 1).all()
def test_gradient_descent(model, dataset):
    _opt = optim.SGD(model.parameters(), lr=0.500)
    opt = GradientDescent(model, dataset, _opt)
    data: List[Dict[str, Any]] = []
    initial_loss = opt._get_loss()
    for epoch in range(5):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)
    # Full-batch gradient descent: the loss decreases monotonically, every
    # "batch" is the whole dataset, and each model update is exactly one epoch
    assert (df.batch_loss.diff().dropna() < 0).all()
    assert (df.len_dataset == df.batch_size).all()
    assert np.allclose(df.epochs.diff().dropna(), 1)
def test_basics(model, dataset, epochs=14):
    optimizer = optim.Adadelta(model.parameters(), lr=1)
    opt = BaseDamper(model, dataset, optimizer, initial_batch_size=8)
    data: List[Dict[str, Any]] = []
    for epoch in range(1, epochs + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)
    # With a fixed batch size, examples seen = model updates * batch size
    assert (df.model_updates * df.batch_size == df.num_examples).all()
    assert df.epochs.max() <= epochs + 2
    # Examples seen per record never exceed the dataset size by more than one batch
    eg_per_epoch = df.num_examples.diff().iloc[1:]
    len_dataset = df.len_dataset.iloc[1:]
    assert all((eg_per_epoch - len_dataset) <= df.batch_size.iloc[1:])
def test_adadamp(model, dataset):
    init_bs = 8
    _opt = optim.SGD(model.parameters(), lr=0.500)
    opt = AdaDamp(model, dataset, _opt, initial_batch_size=init_bs)
    data: List[Dict[str, Any]] = []
    initial_loss = opt._get_loss()
    for epoch in range(5):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)
    # AdaDamp grows the batch size inversely with the full loss:
    # B_t = floor(B_0 * L_0 / L_t) + 1
    bs_hat = init_bs * df.loc[0, "_complete_loss"] / df._complete_loss
    bs_hat = bs_hat.values.astype(int) + 1
    bs = df.batch_size.values
    assert (bs == bs_hat).all()
def test_large_batch_size(model, large_dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = BaseDamper(model, large_dataset, _opt, initial_batch_size=1024)
    data: List[Dict[str, Any]] = []
    data2: List[Dict[str, Any]] = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, _ = experiment.train(model, opt)
        data.append(opt.meta)
        data2.append(meta)
    df = pd.DataFrame(data)
    # Make sure the loss is decreasing
    assert df.batch_loss.diff().median() < -0.01
    assert df.batch_loss.diff().mean() < -0.01
    assert 2.25 < df.loc[0, "batch_loss"]
    assert df.loc[15, "batch_loss"] < 2.06
def test_geodamp(model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = GeoDamp(
        model, dataset, _opt, initial_batch_size=1, dampingdelay=4, dampingfactor=2
    )
    data: List[Dict[str, Any]] = []
    # Let GeoDamp train for at least one full epoch per call
    for epoch in range(1, 16 + 1):
        model, opt, meta, _ = experiment.train(model, opt)
        data.append(opt.meta)
    df = pd.DataFrame(data)
    # Each record should land on a whole-epoch boundary
    assert np.allclose(df.epochs, np.floor(df.epochs))
    # Damping doubles every dampingdelay=4 epochs, so each value appears 4 times
    counts = df.damping.value_counts()
    assert set(counts.index.astype(int)) == {1, 2, 4, 8}
    assert np.allclose(counts.unique(), 4)
def test_dwell_init_geo_increase(model, dataset):
    dwell = 512
    _opt = optim.Adagrad(model.parameters(), lr=1)
    # batch_growth_rate=1: every model update increases the batch size by 1
    opt = PadaDamp(
        model, dataset, _opt, dwell=dwell, initial_batch_size=4, batch_growth_rate=1
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)
    cbs = np.arange(64) + 1  # continuous batch sizes
    dbs = [[cbs[2 ** i]] * 2 ** i for i in range(4)]  # discrete batch sizes
    dbs = sum(dbs, [])
    assert len(dbs) == 15
    # The initial increase is geometric, so the observed batch sizes stay at
    # or below this doubling schedule
    assert (df.batch_size.iloc[1 : 1 + len(dbs)] <= np.array(dbs)).all()
def test_avg_loss(model, dataset):
    """
    Test that BaseDamper._get_loss returns the mean loss regardless of how
    many points are sampled.
    """
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = BaseDamper(model, dataset, _opt)
    for epoch in range(1, 16 + 1):
        model, opt, meta, _ = experiment.train(model, opt)
    loss = [
        {"loss": opt._get_loss(frac=frac), "frac": frac, "repeat": repeat}
        for frac in np.linspace(0.5, 0.99, num=5)
        for repeat in range(5)
    ]
    total_loss = opt._get_loss(frac=1)
    df = pd.DataFrame(loss)
    summary = df.pivot(index="frac", columns="repeat", values="loss")
    abs_error = np.abs(df.loss - total_loss)
    rel_error = abs_error / total_loss
    assert rel_error.max() <= 0.125
    assert np.percentile(rel_error, 50) <= 0.12
    assert 1.5 <= total_loss <= 2.2
    assert abs_error.max() <= 0.17
def test_dwell(dwell, model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    # batch_growth_rate=1: every model update increases the batch size by 1
    opt = PadaDamp(
        model, dataset, _opt, dwell=dwell, initial_batch_size=4, batch_growth_rate=1
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)
    # Skip the initial geometric increase (covered by test_dwell_init_geo_increase)
    damping = df.damping.iloc[dwell:]
    chunks = [
        damping.iloc[dwell * k : dwell * (k + 1)].values
        for k in range(len(df) // dwell)
    ]
    chunks = [c for c in chunks if len(c)]
    if dwell > 1:
        # The damping should be constant within each dwell-sized chunk
        assert all(np.allclose(np.diff(c), 0) for c in chunks[1:])
    else:
        assert all(len(c) <= 1 for c in chunks)
def test_main():
    from adadamp.experiment import train, test

    # Training settings
    args = SimpleNamespace(
        batch_size=1024,
        epochs=2,
        log_interval=10,
        lr=0.1,
        no_cuda=False,
        save_model=False,
        seed=1,
        test_batch_size=1000,
    )
    use_cuda = not args.no_cuda and torch.cuda.is_available()
    torch.manual_seed(args.seed)
    device = torch.device("cuda" if use_cuda else "cpu")

    train_set = datasets.MNIST(
        "../data",
        train=True,
        download=True,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )
    test_set = datasets.MNIST(
        "../data",
        train=False,
        transform=transforms.Compose(
            [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
        ),
    )
    # Only for making the test run faster
    dataset, _ = torch.utils.data.random_split(
        train_set, [2000, len(train_set) - 2000]
    )
    train_set, test_set = torch.utils.data.random_split(dataset, [1000, 1000])
    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    model = Net().to(device)
    _optimizer = optim.SGD(model.parameters(), lr=args.lr)
    loss = F.nll_loss
    optimizer = PadaDamp(
        model=model,
        dataset=train_set,
        opt=_optimizer,
        loss=loss,
        device="cpu",
        batch_growth_rate=0.1,
        initial_batch_size=32,
        max_batch_size=1024,
    )
    print("Starting...")
    for epoch in range(1, args.epochs + 1):
        train(model=model, opt=optimizer, verbose=10)
        data = test(model=model, loss=loss, dataset=test_set)
        print(data)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")