Python MultiStageChrono.time Exemples, benchutils.chrono.MultiStageChrono.time Python Exemples

Exemple #1

0

Afficher le fichier

def run_check(spec, repeat=10, number=20, report_name=None):
    """Time forward + backward passes for every configuration in ``spec``.

    For each ``(algo, tensor_sizes)`` pair in ``spec['algos']`` and each
    keyword-argument dict in ``spec['args']`` a layer is built with
    ``algo(**arg).cuda()``.  That layer is then timed for every combination
    of ``spec['batch_size']`` and ``tensor_sizes``.

    Args:
        spec: Mapping providing the keys ``'args'``, ``'inputs'``,
            ``'algos'``, ``'batch_size'``, ``'get_output_layer'`` and
            ``'get_output_size'``.
        repeat: Number of timed observations recorded per configuration.
        number: Number of forward/backward iterations inside one timed
            observation.
        report_name: Optional path; when given, the chrono report is also
            written to that file with ``json.dump``.

    Failures of a single configuration are printed together with their
    traceback and do not abort the remaining configurations.
    """
    chrono = MultiStageChrono(skip_obs=2)

    args = spec['args']
    input_gen = spec['inputs']
    algos = spec['algos']
    batch_sizes = spec['batch_size']
    get_output_layer = spec['get_output_layer']
    get_output_size = spec['get_output_size']

    for algo, tensor_sizes in algos:
        for arg in args:
            # initialize the layer that we will benchmark
            layer = algo(**arg).cuda()

            for batch_size in batch_sizes:
                for tensor_size in tensor_sizes:
                    name = f'algo={algo.__name__},batch={batch_size},tensor={tensor_size},arg={arg}'
                    print(name)
                    try:
                        layer_inputs = input_gen(layer, batch_size, tensor_size)
                        # The target is built lazily on the first iteration,
                        # once the output shape of the layer is known.
                        target = None
                        size = None
                        criterion = nn.MSELoss()
                        # Reset so a previous configuration's timer is never
                        # reported when `repeat` is 0.
                        timer = None

                        # Benchmark the layer
                        for _ in range(repeat):
                            with chrono.time(name) as timer:
                                for _ in range(number):
                                    out = layer(*layer_inputs)
                                    out = get_output_layer(*out)

                                    if target is None:
                                        if get_output_size is None:
                                            # flatten every dim but the first
                                            size = reduce(mul, out.shape[1:])
                                        else:
                                            size = get_output_size(out.shape)

                                        target = torch.randn(batch_size, size).cuda()

                                    loss = criterion(target, out.view(-1, size))
                                    loss.backward()

                                    # wait for the GPU so the timer covers the
                                    # actual compute and not only kernel launch
                                    torch.cuda.synchronize()

                        if timer is not None:
                            print(f'  Ran in {timer.avg:5.2f}s {timer.avg * repeat:5.2f}s')
                    except Exception as e:
                        # best effort: report the failure and move on to the
                        # next configuration
                        print(f'[!] > {e}')
                        print(traceback.format_exc())

    report = chrono.to_json(indent=2)
    print(report)

    if report_name is not None:
        # NOTE(review): if `to_json` already returns a JSON string this dumps
        # a quoted string rather than an object -- confirm the intended format.
        with open(report_name, 'w') as report_file:
            json.dump(report, report_file, indent=2)

Exemple #2

0

Afficher le fichier

Fichier : minimal.py Projet : Delaunay/MixedPrecision

def test_cnn_base():
    """Time ``CNNBase`` forward passes on CUDA for a range of batch sizes.

    For every batch size a fresh network and a random input batch are
    created, forward passes are timed with ``MultiStageChrono`` and the
    resulting throughput (items per second) is printed, together with the
    ratio to the previous batch size's throughput.
    """
    # torch.Size([2, 4, 84, 84])
    obs_shape = (4, 84, 84)

    device = torch.device('cuda')
    print(torch.cuda.get_device_name(device))

    print(CNNBase(obs_shape[0], hidden_size=512))

    prev = None
    for batch_size in [2, 4, 8, 16, 32, 64, 128]:
        chrono = MultiStageChrono()
        repeat = 30
        exp = f'forward_{batch_size}'

        batch = torch.rand(batch_size, *obs_shape).cuda()

        net = CNNBase(obs_shape[0], hidden_size=512)
        net.cuda()

        # Two rounds of observations (10, then 30), all recorded under the
        # same label -- presumably warm-up followed by measurement.
        for observations in (10, 30):
            for _ in range(observations):
                with chrono.time(exp):
                    for _ in range(repeat):
                        net(batch, None, None)
                    # synchronize once per observation, after all passes
                    torch.cuda.synchronize()

        speed = batch_size * repeat / chrono.chronos[exp].avg
        speed_up = f'Speed up x{speed / prev:7.4f}' if prev else ''

        print(f'{exp:>30} {speed:12,.4f} item/sec {speed_up}')
        prev = speed

Exemple #3

0

Afficher le fichier

Fichier : train.py Projet : Delaunay/MixedPrecision

def train(models,
          epochs,
          dataset,
          olr,
          lr_reset_threshold=1e-05,
          output_name='/tmp/',
          device_name='gpu'):
    """Train several models side by side on the same batches.

    Every model gets its own ``WindowedSGD`` optimizer.  Models are spread
    round-robin over the available CUDA devices (or run on the CPU when CUDA
    is not available).  After each epoch every model's ``state_dict`` is
    saved to ``{output_name}/{name}_{epoch}``.

    Args:
        models: Mapping of model name to model.
        epochs: Number of passes over ``dataset``.
        dataset: Dataset handed to ``torch.utils.data.DataLoader``.
        olr: Learning rate passed to the optimizer as ``lr``.
        lr_reset_threshold: Passed to the optimizer as ``lr_min``.
        output_name: Directory prefix for the per-epoch checkpoints.
        device_name: Device the models are initially moved to.  ``'gpu'`` is
            accepted as an alias for ``'cuda'`` (``'cpu'`` when CUDA is not
            available).

    Returns:
        A list with one entry per epoch; each entry is a list holding the
        summed loss of every model for that epoch.
    """
    # 'gpu' is not a device type understood by torch.device, so the default
    # value would otherwise fail before training even starts.
    if device_name == 'gpu':
        device_name = 'cuda' if torch.cuda.is_available() else 'cpu'

    device = torch.device(device_name)

    train_loader = torch.utils.data.DataLoader(batch_size=64,
                                               shuffle=True,
                                               num_workers=4,
                                               dataset=dataset)

    if torch.cuda.is_available():
        devices = [
            torch.device(f'cuda:{i}') for i in range(torch.cuda.device_count())
        ]
    else:
        devices = [torch.device('cpu')]
    nd = len(devices)

    dataset_size = len(train_loader)
    models_optim = {}

    for name, model in models.items():
        model = model.to(device)
        optimizer = WindowedSGD(model.parameters(),
                                epoch_steps=dataset_size,
                                window=dataset_size,
                                lr_min=lr_reset_threshold,
                                lr=olr)

        model.train()
        models_optim[name] = (model, optimizer)

    epoch_time = MultiStageChrono(name='train', skip_obs=10)
    costs = []
    print('Start training')
    for e in range(0, epochs):
        all_cost = [0] * len(models_optim)

        with epoch_time.time('epoch') as step_time:
            for data, target in train_loader:

                with epoch_time.time('models'):

                    for mid, (name,
                              (model,
                               optimizer)) in enumerate(models_optim.items()):
                        # round-robin placement of the models over devices
                        model_device = devices[mid % nd]

                        if torch.cuda.is_available():
                            torch.cuda.set_device(model_device)

                        # positional args: dtype, non_blocking, copy
                        model_data = data.to(model_device, torch.float, True, True)
                        model_target = target.to(model_device, torch.long, True, True)

                        model = model.to(model_device)

                        # Label the stage with the model's name, like every
                        # other stage, rather than with the model object.
                        # NOTE(review): a model named 'epoch', 'models' or
                        # 'check_point' would share that stage's label.
                        with epoch_time.time(name):
                            optimizer.zero_grad()

                            output = model(model_data)
                            loss = F.nll_loss(output, model_target)
                            loss.backward()

                            all_cost[mid] += loss.item()
                            optimizer.step(loss)

                            if torch.cuda.is_available():
                                torch.cuda.synchronize()

        with epoch_time.time('check_point'):
            for name, (model, _) in models_optim.items():
                torch.save(model.state_dict(), f'{output_name}/{name}_{e}')

        infos = [
            f'{all_cost[idx]:8.2f}, {models_optim[name][1].lr:10.8f}'
            for idx, name in enumerate(models_optim)
        ]

        print(f'{e:3d}/{epochs:3d}, {step_time.val:6.2f}, ' + ', '.join(infos))

        costs.append(all_cost)

    print(epoch_time.to_json())
    return costs

Exemple #4

0

Afficher le fichier

Fichier : convnet.py Projet : Delaunay/MixedPrecision

def _parse_bool(value):
    """Convert a command-line string into a bool for argparse ``type=``.

    ``bool('False')`` is ``True``, so ``type=bool`` cannot be used to switch
    a flag off.  Common "false" spellings (and the empty string) map to
    ``False``; any other value keeps its previous meaning of ``True``.
    """
    if isinstance(value, bool):
        return value
    return value.strip().lower() not in ('', '0', 'false', 'f', 'no', 'n', 'off')


def main():
    """Benchmark the training loop of a model selected on the command line.

    Parses the command-line options, builds ``models.__dict__[args.arch]``
    with an SGD optimizer and a cross-entropy loss, then runs
    ``args.epochs`` epochs of ``batch_count`` batches each while timing the
    'train', 'loading' and 'compute' stages with ``MultiStageChrono``.  The
    timing report and the measured images per second are printed at the end.
    """
    # ----
    parser = argparse.ArgumentParser()

    parser.add_argument('--data', metavar='DIR', help='path to dataset')
    parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18')
    parser.add_argument('--lr',
                        '--learning-rate',
                        default=0.1,
                        type=float,
                        metavar='LR')
    parser.add_argument('--opt-level', type=str)
    parser.add_argument('--cuda',
                        action='store_true',
                        default=True,
                        dest='cuda')
    parser.add_argument('--no-cuda', action='store_false', dest='cuda')
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--loader', type=str, default='torch')
    parser.add_argument('--prof', type=int, default=None)
    parser.add_argument('--workers', type=int, default=4)
    parser.add_argument('--seed', type=int, default=4)
    parser.add_argument('--epochs', type=int, default=4)
    # `type=bool` would turn `--sync-all False` into True
    parser.add_argument('--sync-all', type=_parse_bool, default=False)

    args = parser.parse_args()
    chrono = MultiStageChrono(skip_obs=10, sync=None)

    device = torch.device('cpu')
    if torch.cuda.is_available() and args.cuda:
        device = torch.device('cuda')

    torch.set_num_threads(args.workers)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # --
    try:
        import torch.backends.cudnn as cudnn
        cudnn.benchmark = True
    except ImportError:
        pass

    # ----
    model = models.__dict__[args.arch]()
    model = model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)

    optimizer = torch.optim.SGD(model.parameters(), args.lr)

    # # ----
    # model, optimizer = amp.initialize(
    #     model,
    #     optimizer,
    #     enabled=args.opt_level != 'O0',
    #     cast_model_type=None,
    #     patch_torch_functions=True,
    #     keep_batchnorm_fp32=None,
    #     master_weights=None,
    #     loss_scale="dynamic",
    #     opt_level=args.opt_level
    # )

    # ----
    train_loader = loaders.load_dataset(args, train=True)

    # dataset is reduced but should be big enough for benchmark!
    batch_iter = iter(train_loader)

    def next_batch(iterator):
        """Return ``(batch, iterator)``, restarting the loader when exhausted."""
        try:
            return next(iterator), iterator
        except StopIteration:
            iterator = iter(train_loader)
            return next(iterator), iterator

    batch_count = len(train_loader)
    if args.prof is not None:
        batch_count = args.prof

    def cuda_sync():
        return torch.cuda.current_stream().synchronize()

    # Only synchronize the CUDA stream when actually running on CUDA;
    # on a CPU-only run there is no stream to wait for.
    sync_fun = cuda_sync if device.type == 'cuda' else None
    sub_syncs = None
    if args.sync_all:
        sub_syncs = sync_fun

    print('Computing...')
    model.train()
    for epoch in range(args.epochs):

        # we sync after batch_count to not slowdown things
        with chrono.time('train', skip_obs=1, sync=sync_fun) as timer:
            for _ in range(batch_count):

                # data loading do not start here so naturally this is not data loading
                # only the time waiting for the data loading to finish
                with chrono.time('loading', sync=sub_syncs):
                    (batch_input, target), batch_iter = next_batch(batch_iter)

                    batch_input = batch_input.to(device)
                    target = target.to(device)

                # if we do not synchronize we only get cuda `launch time`
                # not the actual compute
                with chrono.time('compute', sync=sub_syncs):
                    output = model(batch_input)
                    loss = criterion(output, target)

                    # compute gradient and do SGD step
                    optimizer.zero_grad()

                    # with amp.scale_loss(loss, optimizer) as scaled_loss:
                    #     scaled_loss.backward()

                    loss.backward()
                    optimizer.step()

        print(
            f'[{epoch:3d}/{args.epochs:3d}] ETA: {(args.epochs - epoch - 1) * timer.avg:6.2f} sec'
        )

    print('--')
    print(chrono.to_json(indent=2))
    print('--')
    print(
        f'{(args.batch_size * batch_count) / chrono.chronos["train"].avg:6.2f} Img/sec'
    )
    print('-' * 25)

Exemple #5

0

Afficher le fichier

Fichier : minimal.py Projet : Delaunay/MixedPrecision

            nn.ReLU(),
        )

    print(make_linear_model(4))

    prev = None
    for batch_size in [2, 4, 8, 16, 32, 64, 128]:
        chrono = MultiStageChrono()
        repeat = 30
        exp = f'forward_{batch_size}'

        input = torch.rand(batch_size, 4, 84, 84).cuda()
        net = make_linear_model(batch_size).cuda()

        for _ in range(0, 10):
            with chrono.time(exp):
                for _ in range(0, repeat):
                    net(input)
                torch.cuda.synchronize()

        for _ in range(0, 30):
            with chrono.time(exp):
                for _ in range(0, repeat):
                    net(input)
                torch.cuda.synchronize()

        speed = batch_size * repeat / chrono.chronos[exp].avg
        speed_up = ''
        if prev:
            speed_up = f'Speed up x{speed / prev:7.4f}'