Example #1
def test_padadamp(model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    opt = PadaDamp(
        model, dataset, _opt, batch_growth_rate=1, initial_batch_size=4, dwell=1
    )
    data: List[Dict[str, Any]] = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data += train_data
    df = pd.DataFrame(data)
    assert (df.damping >= 1).all()
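The model and dataset arguments above are pytest fixtures supplied by the test suite. A minimal sketch of plausible fixtures (the layer sizes and sample counts are assumptions, not taken from the suite):

import pytest
import torch
from torch import nn
from torch.utils.data import TensorDataset

@pytest.fixture
def model():
    # A tiny model; the real suite may use something more elaborate.
    return nn.Linear(10, 1)

@pytest.fixture
def dataset():
    # 128 random samples with 10 features each (assumed sizes).
    X = torch.randn(128, 10)
    y = torch.randn(128, 1)
    return TensorDataset(X, y)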
Example #2
def test_dwell_init_geo_increase(model, dataset):
    dwell = 512
    _opt = optim.Adagrad(model.parameters(), lr=1)
    # batch_growth_rate=1: every model update increases the batch size by 1
    opt = PadaDamp(
        model, dataset, _opt, dwell=dwell, initial_batch_size=4, batch_growth_rate=1
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)
    cbs = np.arange(64) + 1  # continuous batch size
    dbs = [[cbs[2 ** i]] * 2 ** i for i in range(4)]  # discrete bs
    dbs = sum(dbs, [])
    assert len(dbs) == 15
    # Because of the initial exponential increase for geodamp
    assert (df.batch_size.iloc[1 : 1 + len(dbs)] <= np.array(dbs)).all()
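For reference, the discrete batch-size list built above consists of geometrically repeated blocks. This standalone snippet mirrors the NumPy computation in pure Python and shows exactly what the assertion compares against:

cbs = list(range(1, 65))  # continuous batch sizes: 1, 2, ..., 64
dbs = sum(([cbs[2**i]] * 2**i for i in range(4)), [])
print(dbs)       # [2, 3, 3, 5, 5, 5, 5, 9, 9, 9, 9, 9, 9, 9, 9]
print(len(dbs))  # 15 = 1 + 2 + 4 + 8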
Example #3
def test_dwell(dwell, model, dataset):
    _opt = optim.Adadelta(model.parameters(), lr=1)
    # batch_growth_rate=1: every model update increases the batch size by 1
    opt = PadaDamp(
        model, dataset, _opt, dwell=dwell, initial_batch_size=4, batch_growth_rate=1
    )
    data = []
    for epoch in range(1, 16 + 1):
        model, opt, meta, train_data = experiment.train(model, opt)
        data.extend(train_data)
    df = pd.DataFrame(data)

    # Because of the geometric delay... (tested below)
    damping = df.damping.iloc[dwell:]

    chunks = [
        damping.iloc[dwell * k : dwell * (k + 1)].values
        for k in range(len(df) // dwell)
    ]
    chunks = [c for c in chunks if len(c)]
    if dwell > 1:
        assert all(np.allclose(np.diff(c), 0) for c in chunks[1:])
    else:
        assert all(len(c) <= 1 for c in chunks)
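To see why the chunked assertion holds, consider a toy damping series with dwell=4: the damping is recomputed only every dwell-th update, so it is piecewise constant in blocks of length dwell. The values below are made up purely for illustration:

import numpy as np
import pandas as pd

dwell = 4
damping = pd.Series([1, 1, 1, 1, 2, 2, 2, 2, 4, 4, 4, 4])
chunks = [
    damping.iloc[dwell * k : dwell * (k + 1)].values
    for k in range(len(damping) // dwell)
]
# Within each block the damping does not change.
assert all(np.allclose(np.diff(c), 0) for c in chunks)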
Example #4
def test_main():
    from adadamp.experiment import train, test

    # Training settings
    args = SimpleNamespace(
        batch_size=1024,
        epochs=2,
        log_interval=10,
        lr=0.1,
        no_cuda=False,
        save_model=False,
        seed=1,
        test_batch_size=1000,
    )

    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if use_cuda else "cpu")

    train_set = datasets.MNIST(
        "../data",
        train=True,
        download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ]),
    )
    test_set = datasets.MNIST(
        "../data",
        train=False,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ]),
    )

    # Only for making tests run faster
    dataset, _ = torch.utils.data.random_split(
        train_set, [2000, len(train_set) - 2000])
    train_set, test_set = torch.utils.data.random_split(dataset, [1000, 1000])

    kwargs = {"num_workers": 1, "pin_memory": True} if use_cuda else {}

    model = Net().to(device)
    _optimizer = optim.SGD(model.parameters(), lr=args.lr)
    loss = F.nll_loss
    optimizer = PadaDamp(
        model=model,
        dataset=train_set,
        opt=_optimizer,
        loss=loss,
        device="cpu",
        batch_growth_rate=0.1,
        initial_batch_size=32,
        max_batch_size=1024,
    )

    print("Starting...")
    for epoch in range(1, args.epochs + 1):
        train(model=model, opt=optimizer, verbose=10)
        data = test(model=model, loss=loss, dataset=test_set)
        print(data)

    if args.save_model:
        torch.save(model.state_dict(), "mnist_cnn.pt")
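PadaDamp grows the batch size as training progresses, starting from initial_batch_size and capped at max_batch_size. The exact schedule is internal to the library; the sketch below is only a linear-growth guess, not PadaDamp's actual rule:

import math

def approx_batch_size(k, initial=32, rate=0.1, cap=1024):
    # Assumed shape: batch size after k model updates grows linearly.
    return min(initial + math.ceil(rate * k), cap)

print([approx_batch_size(k) for k in (0, 100, 1000, 10_000)])
# [32, 42, 132, 1024]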
Example #5
def main(
    dataset: str = "fashionmnist",
    initial_batch_size: int = 64,
    epochs: int = 6,
    verbose: Union[int, bool] = False,
    lr: float = 1.0,
    cuda: bool = False,
    random_state: Optional[int] = None,  # seed to pass to BaseDamper
    init_seed: Optional[int] = None,  # seed for initialization
    tuning: bool = True,  # if truthy, also used (as int) to seed the train/test split
    damper: str = "geodamp",
    batch_growth_rate: float = 0.01,
    dampingfactor: Number = 5.0,
    dampingdelay: int = 5,
    max_batch_size: Optional[int] = None,
    test_freq: float = 1,
    approx_loss: bool = False,
    rho: float = 0.9,
    dwell: int = 1,
    approx_rate: bool = False,
    model: Optional[str] = None,
    momentum: Optional[Union[float, int]] = 0,
    nesterov: bool = False,
    weight_decay: float = 0,
) -> Tuple[List[Dict], List[Dict]]:
    # Get (tuning, random_state, init_seed)
    assert int(tuning) or isinstance(tuning, bool)
    assert isinstance(random_state, int)
    assert isinstance(init_seed, int)

    if "NUM_THREADS" in os.environ:
        v = os.environ["NUM_THREADS"]
        if v:
            print(f"NUM_THREADS={v} (int(v)={int(v)})")
            torch.set_num_threads(int(v))

    args: Dict[str, Any] = {
        "initial_batch_size": initial_batch_size,
        "max_batch_size": max_batch_size,
        "batch_growth_rate": batch_growth_rate,
        "dampingfactor": dampingfactor,
        "dampingdelay": dampingdelay,
        "epochs": epochs,
        "verbose": verbose,
        "lr": lr,
        "no_cuda": not cuda,
        "random_state": random_state,
        "init_seed": init_seed,
        "damper": damper,
        "dataset": dataset,
        "approx_loss": approx_loss,
        "test_freq": test_freq,
        "rho": rho,
        "dwell": dwell,
        "approx_rate": approx_rate,
        "nesterov": nesterov,
        "momentum": momentum,
        "weight_decay": weight_decay,
    }
    pprint(args)

    no_cuda = not cuda
    args["ident"] = ident(args)
    args["tuning"] = tuning

    use_cuda = not args["no_cuda"] and torch.cuda.is_available()
    device = "cuda" if use_cuda else "cpu"
    _device = torch.device(device)
    _set_seed(args["init_seed"])

    transform_train = [
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307, ), std=(0.3081, )),
    ]
    transform_test = [
        transforms.ToTensor(),
        transforms.Normalize((0.1307, ), (0.3081, ))
    ]
    assert dataset in ["fashionmnist", "cifar10", "synthetic"]
    if dataset == "fashionmnist":
        _dir = "_traindata/fashionmnist/"
        train_set = FashionMNIST(
            _dir,
            train=True,
            transform=Compose(transform_train),
            download=True,
        )
        test_set = FashionMNIST(_dir,
                                train=False,
                                transform=Compose(transform_test))
        model = Net()
    elif dataset == "cifar10":
        transform_train = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ]

        transform_test = [
            transforms.ToTensor(),
            transforms.Normalize((0.4914, 0.4822, 0.4465),
                                 (0.2023, 0.1994, 0.2010)),
        ]

        _dir = "_traindata/cifar10/"
        train_set = CIFAR10(
            _dir,
            train=True,
            transform=Compose(transform_train),
            download=True,
        )
        test_set = CIFAR10(_dir,
                           train=False,
                           transform=Compose(transform_test))
        if model == "wideresnet":
            model = WideResNet(16, 4, 0.3, 10)
        else:
            model = _get_resnet18()
    elif dataset == "synthetic":
        data_kwargs = {"n": 10_000, "d": 100}
        args.update(data_kwargs)
        train_set, test_set, data_stats = synth_dataset(**data_kwargs)
        args.update(data_stats)
        model = LinearNet(data_kwargs["d"])
    else:
        raise ValueError(
            f"dataset={dataset} not in ['fashionmnist', 'cifar10', 'synthetic']")
    if tuning:
        train_size = int(0.8 * len(train_set))
        test_size = len(train_set) - train_size

        train_set, test_set = random_split(
            train_set,
            [train_size, test_size],
            random_state=int(tuning),
        )
        train_x = [x.abs().sum().item() for x, _ in train_set]
        train_y = [y for _, y in train_set]
        test_x = [x.abs().sum().item() for x, _ in test_set]
        test_y = [y for _, y in test_set]
        data_stats = {
            "train_x_sum": sum(train_x),
            "train_y_sum": sum(train_y),
            "test_x_sum": sum(test_x),
            "test_y_sum": sum(test_y),
            "len_train_x": len(train_x),
            "len_train_y": len(train_y),
            "len_test_x": len(test_x),
            "len_test_y": len(test_y),
            "tuning": int(tuning),
        }
        args.update(data_stats)
        pprint(data_stats)

    model = model.to(_device)
    _set_seed(args["random_state"])

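    # Map the damper name onto an underlying torch optimizer; the damper
    # classes constructed further below wrap this optimizer and adapt the
    # batch size (or, for the *LR variants, the learning rate) during training.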
    if args["damper"] == "adagrad":
        optimizer = optim.Adagrad(model.parameters(), lr=args.get("lr", 0.01))
    elif args["damper"] == "adadelta":
        optimizer = optim.Adadelta(model.parameters(), rho=rho)
    else:
        if not args["nesterov"]:
            assert args["momentum"] == 0
        optimizer = optim.SGD(model.parameters(),
                              lr=args["lr"],
                              nesterov=args["nesterov"],
                              momentum=args["momentum"],
                              weight_decay=args["weight_decay"])
    n_data = len(train_set)

    opt_args = [model, train_set, optimizer]
    opt_kwargs = {
        k: args[k]
        for k in ["initial_batch_size", "max_batch_size", "random_state"]
    }
    opt_kwargs["device"] = device
    if dataset == "synthetic":
        opt_kwargs["loss"] = F.mse_loss
    if dataset == "cifar10":
        opt_kwargs["loss"] = F.cross_entropy
    if args["damper"].lower() == "padadamp":
        if approx_rate:
            assert isinstance(max_batch_size, int)
            BM = max_batch_size
            B0 = initial_batch_size
            e = epochs
            n = n_data
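            # Closed-form growth rate computed below (with B_M = max_batch_size,
            # B_0 = initial_batch_size, e = epochs, n = len(train_set)):
            #   r_hat = (4/3) * (B_M - B_0) * (B_0 + 2*B_M + 3)
            #           / (2*B_M - 2*B_0 + 3*e*n)
            # presumably chosen so the batch size reaches B_M by the end of training.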
            r_hat = 4 / 3 * (BM - B0) * (B0 + 2 * BM + 3)
            r_hat /= (2 * BM - 2 * B0 + 3 * e * n)
            args["batch_growth_rate"] = r_hat

        opt = PadaDamp(
            *opt_args,
            batch_growth_rate=args["batch_growth_rate"],
            dwell=args["dwell"],
            **opt_kwargs,
        )
    elif args["damper"].lower() == "geodamp":
        opt = GeoDamp(
            *opt_args,
            dampingdelay=args["dampingdelay"],
            dampingfactor=args["dampingfactor"],
            **opt_kwargs,
        )
    elif args["damper"].lower() == "geodamplr":
        opt = GeoDampLR(
            *opt_args,
            dampingdelay=args["dampingdelay"],
            dampingfactor=args["dampingfactor"],
            **opt_kwargs,
        )
    elif args["damper"].lower() == "cntsdamplr":
        opt = CntsDampLR(
            *opt_args,
            dampingfactor=args["dampingfactor"],
            **opt_kwargs,
        )
    elif args["damper"].lower() == "adadamp":
        opt = AdaDamp(*opt_args,
                      approx_loss=approx_loss,
                      dwell=args["dwell"],
                      **opt_kwargs)
    elif args["damper"].lower() == "gd":
        opt = GradientDescent(*opt_args, **opt_kwargs)
    elif (args["damper"].lower() in ["adagrad", "adadelta", "sgd", "gd"]
          or args["damper"] is None):
        opt = BaseDamper(*opt_args, **opt_kwargs)
    else:
        raise ValueError("argument damper not recognized")
    if dataset == "synthetic":
        pprint(data_stats)
        opt._meta["best_train_loss"] = data_stats["best_train_loss"]

    data, train_data = experiment.run(
        model=model,
        opt=opt,
        train_set=train_set,
        test_set=test_set,
        args=args,
        test_freq=test_freq,
        train_stats=dataset == "synthetic",
        verbose=verbose,
        device="cuda" if use_cuda else "cpu",
    )
    return data, train_data
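A minimal invocation, assuming the module's dependencies are installed and the dataset can be downloaded (the argument values are arbitrary; random_state and init_seed must be ints because of the asserts at the top of main):

data, train_data = main(
    dataset="fashionmnist",
    damper="padadamp",
    epochs=1,
    initial_batch_size=32,
    max_batch_size=256,
    batch_growth_rate=0.1,
    random_state=42,
    init_seed=0,
)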