Example #1
    def baseline(devices: List[int]) -> Stuffs:
        """Baseline: the whole AmoebaNet-D (18, 208) model on a single device."""
        L, D = 18, 208

        model = amoebanetd(num_classes=1000, num_layers=L, num_filters=D)
        device = devices[0]
        model.to(device)

        return model, L, D, [torch.device(device)]
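
Stuffs and amoebanetd are defined elsewhere in the benchmark module and are not shown in this snippet. Judging from the return statement, Stuffs is presumably a tuple of the model, its layer count, its filter count, and the devices it occupies. A minimal sketch of that assumed alias plus a hypothetical invocation (both are assumptions, not the module's actual code):

    from typing import List, Tuple

    import torch
    from torch import nn

    # Presumed alias, inferred from the return statement above.
    Stuffs = Tuple[nn.Module, int, int, List[torch.device]]

    # Hypothetical invocation on a single GPU (device index 0).
    if torch.cuda.is_available():
        model, L, D, used = baseline([0])
        assert next(model.parameters()).device == used[0]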
Example #2
    def pipeline8(devices: List[int]) -> Stuffs:
        """AmoebaNet-D (72, 512) split into 8 GPipe partitions."""
        L, D = 72, 512
        # Number of consecutive child modules placed on each of the 8 devices.
        balance = [23, 16, 11, 6, 6, 5, 5, 6]

        model: nn.Module = amoebanetd(num_classes=1000,
                                      num_layers=L,
                                      num_filters=D)
        model = cast(nn.Sequential, model)
        # Each mini-batch is split into 128 micro-batches for pipeline parallelism.
        model = GPipe(model, balance, devices=devices, chunks=128)

        return model, L, D, list(model.devices)
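
In the GPipe call above, balance tells GPipe how many consecutive child modules of the sequential model to place on each of the eight devices, and chunks=128 splits every mini-batch into 128 micro-batches so the partitions can work concurrently. A minimal, self-contained sketch of the same pattern on a toy model (hypothetical layer sizes; it assumes the torchgpipe package and at least two CUDA devices):

    import torch
    from torch import nn
    from torchgpipe import GPipe

    if torch.cuda.device_count() >= 2:
        toy = nn.Sequential(
            nn.Linear(256, 256), nn.ReLU(),
            nn.Linear(256, 256), nn.ReLU(),
            nn.Linear(256, 10),
        )
        # 3 child modules go to the first device, 2 to the second; each
        # mini-batch is split into 4 micro-batches that flow through the
        # two partitions in a pipeline.
        toy = GPipe(toy, balance=[3, 2], devices=[0, 1], chunks=4)

        # Inputs go to the first partition's device; the output comes out
        # on the last partition's device.
        x = torch.rand(64, 256, device=toy.devices[0])
        y = toy(x)
        assert y.device == toy.devices[-1]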
Example #3
def cli(
    ctx: click.Context,
    experiment: str,
    epochs: int,
    skip_epochs: int,
    devices: List[int],
) -> None:
    """AmoebaNet-D (18, 256) Speed Benchmark"""
    if skip_epochs >= epochs:
        ctx.fail('--skip-epochs=%d must be less than --epochs=%d' %
                 (skip_epochs, epochs))

    model: nn.Module = amoebanetd(num_classes=1000,
                                  num_layers=18,
                                  num_filters=256)

    f: Experiment = EXPERIMENTS[experiment]
    try:
        model, batch_size, _devices = f(model, devices)
    except ValueError as exc:
        # Examples:
        #   ValueError: too few devices to hold given partitions (devices: 1, partitions: 2)
        ctx.fail(str(exc))

    optimizer = SGD(model.parameters(), lr=0.1)

    # Inputs are fed to the first partition's device; with GPipe, the output
    # (and hence the loss) comes out on the last partition's device.
    in_device = _devices[0]
    out_device = _devices[-1]
    torch.cuda.set_device(in_device)

    # This experiment only cares about training speed, not accuracy. To
    # eliminate any data loading overhead, we use fake random 224x224 images
    # over 1000 labels.
    dataset_size = 10000

    input = torch.rand(batch_size, 3, 224, 224, device=in_device)
    target = torch.randint(1000, (batch_size, ), device=out_device)
    data = [(input, target)] * (dataset_size // batch_size)

    if dataset_size % batch_size != 0:
        last_input = input[:dataset_size % batch_size]
        last_target = target[:dataset_size % batch_size]
        data.append((last_input, last_target))

    # HEADER ======================================================================================

    title = f'{experiment}, {skip_epochs+1}-{epochs} epochs'
    click.echo(title)

    if isinstance(model, GPipe):
        click.echo(f'batch size: {batch_size}, chunks: {model.chunks}, '
                   f'balance: {model.balance}, checkpoint: {model.checkpoint}')
    else:
        click.echo(f'batch size: {batch_size}')

    click.echo(
        'torchgpipe: %s, python: %s, torch: %s, cudnn: %s, cuda: %s, gpu: %s' %
        (torchgpipe.__version__, platform.python_version(), torch.__version__,
         torch.backends.cudnn.version(), torch.version.cuda,
         torch.cuda.get_device_name(in_device)))

    # TRAIN =======================================================================================

    global BASE_TIME
    BASE_TIME = time.time()

    def run_epoch(epoch: int) -> Tuple[float, float]:
        torch.cuda.synchronize(in_device)
        tick = time.time()

        data_trained = 0
        for i, (input, target) in enumerate(data):
            data_trained += input.size(0)

            output = model(input)
            loss = F.cross_entropy(output, target)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # 00:01:02 | 1/20 epoch (42%) | 200.000 samples/sec (estimated)
            percent = (i + 1) / len(data) * 100
            throughput = data_trained / (time.time() - tick)
            log('%d/%d epoch (%d%%) | %.3f samples/sec (estimated)'
                '' % (epoch + 1, epochs, percent, throughput),
                clear=True,
                nl=False)

        torch.cuda.synchronize(in_device)
        tock = time.time()

        # 00:02:03 | 1/20 epoch | 200.000 samples/sec, 123.456 sec/epoch
        elapsed_time = tock - tick
        throughput = dataset_size / elapsed_time
        log('%d/%d epoch | %.3f samples/sec, %.3f sec/epoch'
            '' % (epoch + 1, epochs, throughput, elapsed_time),
            clear=True)

        return throughput, elapsed_time

    throughputs = []
    elapsed_times = []

    hr()
    for epoch in range(epochs):
        throughput, elapsed_time = run_epoch(epoch)

        if epoch < skip_epochs:
            continue

        throughputs.append(throughput)
        elapsed_times.append(elapsed_time)
    hr()

    # RESULT ======================================================================================

    # pipeline-4, 2-10 epochs | 200.000 samples/sec, 123.456 sec/epoch (average)
    n = len(throughputs)
    throughput = sum(throughputs) / n
    elapsed_time = sum(elapsed_times) / n
    click.echo('%s | %.3f samples/sec, %.3f sec/epoch (average)'
               '' % (title, throughput, elapsed_time))
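
This example (and Example #4 below) relies on log and hr helpers and a module-level BASE_TIME that the excerpt does not include. Going only by the sample output in the comments (00:01:02 | ...), they might look roughly like the following sketch; the real helpers in the benchmark script may differ:

    import shutil
    import time

    import click

    BASE_TIME: float = 0.0

    def hr() -> None:
        # Print a horizontal rule across the terminal.
        width = shutil.get_terminal_size().columns
        click.echo('-' * width)

    def log(msg: str, clear: bool = False, nl: bool = True) -> None:
        # Prefix the message with the time elapsed since BASE_TIME as hh:mm:ss.
        if clear:
            # Return to the start of the line and erase it, so progress
            # messages overwrite each other instead of stacking up.
            click.echo('\r\x1b[K', nl=False)
        elapsed = time.time() - BASE_TIME
        h, rest = divmod(int(elapsed), 3600)
        m, s = divmod(rest, 60)
        click.echo('%02d:%02d:%02d | %s' % (h, m, s, msg), nl=nl)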
Example #4
def cli(
    ctx: click.Context,
    experiment: str,
    epochs: int,
    skip_epochs: int,
    devices: List[int],
) -> None:
    """AmoebaNet-D Speed Benchmark"""
    if skip_epochs >= epochs:
        ctx.fail('--skip-epochs=%d must be less than --epochs=%d' %
                 (skip_epochs, epochs))

    model: nn.Module = amoebanetd(num_classes=10)

    f = EXPERIMENTS[experiment]
    try:
        model, batch_size, _devices = f(model, devices)
    except ValueError as exc:
        # Examples:
        #   ValueError: too few devices to hold given partitions (devices: 1, partitions: 2)
        ctx.fail(str(exc))

    optimizer = SGD(model.parameters(), lr=0.1)

    in_device = _devices[0]
    out_device = _devices[-1]

    # This experiment only cares about training speed, not accuracy. To
    # eliminate any data loading overhead, we use a fake dataset with random
    # 224x224 images over 10 labels.
    dataset = RandomDataset()
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=1,
        pin_memory=True,
        drop_last=False,
    )

    # HEADER ======================================================================================

    title = '%s, %d-%d epochs' % (experiment, skip_epochs + 1, epochs)
    click.echo(title)
    click.echo('python: %s, torch: %s, cudnn: %s, cuda: %s, gpu: %s' %
               (platform.python_version(), torch.__version__,
                torch.backends.cudnn.version(), torch.version.cuda,
                torch.cuda.get_device_name(in_device)))

    # TRAIN =======================================================================================

    global BASE_TIME
    BASE_TIME = time.time()

    def run_epoch(epoch: int) -> Tuple[float, float]:
        torch.cuda.synchronize(in_device)
        tick = time.time()

        data_trained = 0
        for i, (input, target) in enumerate(loader):
            data_trained += len(input)

            input = input.to(in_device, non_blocking=True)
            target = target.to(out_device, non_blocking=True)

            output = model(input)
            loss = F.cross_entropy(output, target)
            loss.backward()

            optimizer.step()
            optimizer.zero_grad()

            # 00:01:02 | 1/20 epoch (42%) | 200.000 samples/sec (estimated)
            percent = (i + 1) / len(loader) * 100
            throughput = data_trained / (time.time() - tick)
            log('%d/%d epoch (%d%%) | %.3f samples/sec (estimated)'
                '' % (epoch + 1, epochs, percent, throughput),
                clear=True,
                nl=False)

        torch.cuda.synchronize(in_device)
        tock = time.time()

        # 00:02:03 | 1/20 epoch | 200.000 samples/sec, 123.456 sec/epoch
        elapsed_time = tock - tick
        throughput = len(dataset) / elapsed_time
        log('%d/%d epoch | %.3f samples/sec, %.3f sec/epoch'
            '' % (epoch + 1, epochs, throughput, elapsed_time),
            clear=True)

        return throughput, elapsed_time

    throughputs = []
    elapsed_times = []

    hr()
    for epoch in range(epochs):
        throughput, elapsed_time = run_epoch(epoch)

        if epoch < skip_epochs:
            continue

        throughputs.append(throughput)
        elapsed_times.append(elapsed_time)
    hr()

    # RESULT ======================================================================================

    # pipeline-4, 2-10 epochs | 200.000 samples/sec, 123.456 sec/epoch (average)
    n = len(throughputs)
    throughput = sum(throughputs) / n
    elapsed_time = sum(elapsed_times) / n
    click.echo('%s | %.3f samples/sec, %.3f sec/epoch (average)'
               '' % (title, throughput, elapsed_time))
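
Example #4 reads its fake data from a RandomDataset class that is not part of the excerpt. Based only on the comment in the code (random 224x224 images over 10 labels), a minimal sketch of what such a dataset might look like (the size and label count are assumptions):

    import torch
    from torch.utils.data import Dataset

    class RandomDataset(Dataset):
        # Fake dataset of random 224x224 RGB images with labels in [0, 10).
        def __init__(self, size: int = 10000, num_classes: int = 10):
            self.size = size
            self.num_classes = num_classes

        def __len__(self) -> int:
            return self.size

        def __getitem__(self, index: int):
            image = torch.rand(3, 224, 224)
            label = int(torch.randint(self.num_classes, (1,)))
            return image, label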