import argparse
import json
import traceback
from functools import reduce
from operator import mul

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models

# MultiStageChrono (multi-stage timer), WindowedSGD (optimizer used in train)
# and `loaders` (dataset helpers) are project-local modules; their import
# paths are assumed to be provided by the surrounding package.


def run_check(spec, repeat=10, number=20, report_name=None):
    chrono = MultiStageChrono(skip_obs=2)

    args = spec['args']
    input_gen = spec['inputs']
    algos = spec['algos']
    batch_sizes = spec['batch_size']
    get_output_layer = spec['get_output_layer']
    get_output_size = spec['get_output_size']

    for algo, tensor_sizes in algos:
        for arg in args:
            # initialize the layer that we will benchmark
            layer = algo(**arg).cuda()

            for batch_size in batch_sizes:
                for tensor_size in tensor_sizes:
                    name = f'algo={algo.__name__},batch={batch_size},tensor={tensor_size},arg={arg}'
                    print(name)

                    try:
                        input = input_gen(layer, batch_size, tensor_size)
                        target = None
                        size = None
                        criterion = nn.MSELoss()

                        # benchmark the layer: `repeat` timed observations of
                        # `number` forward/backward passes each
                        for i in range(0, repeat):
                            with chrono.time(name) as timer:
                                for _ in range(0, number):
                                    out = layer(*input)
                                    out = get_output_layer(*out)

                                    # build the random target lazily, once the
                                    # output shape is known
                                    if target is None:
                                        if get_output_size is None:
                                            size = reduce(mul, out.shape[1:])
                                        else:
                                            size = get_output_size(out.shape)
                                        target = torch.randn(batch_size, size).cuda()

                                    loss = criterion(target, out.view(-1, size))
                                    loss.backward()

                                torch.cuda.synchronize()

                            print(f'  Ran in {timer.avg:5.2f}s {timer.avg * repeat:5.2f}s')
                    except Exception as e:
                        print(f'[!] > {e}')
                        print(traceback.format_exc())

    report = chrono.to_json(indent=2)
    print(report)

    if report_name is not None:
        json.dump(report, open(report_name, 'w'), indent=2)
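# Example spec for run_check (a minimal sketch, not part of the original code:
# the schema is inferred from how run_check reads `spec` above, and
# `make_lstm_input` is a hypothetical helper). An LSTM is used because
# run_check unpacks the layer output with `get_output_layer(*out)`, which
# expects a tuple-returning layer such as an RNN.
def make_lstm_input(layer, batch_size, tensor_size):
    # run_check calls `layer(*input)`, so return a one-element tuple;
    # with batch_first=True the shape is (batch, seq_len, input_size)
    return (torch.randn(batch_size, tensor_size, layer.input_size).cuda(),)


example_spec = {
    'args': [dict(input_size=64, hidden_size=128, batch_first=True)],
    'inputs': make_lstm_input,
    'algos': [(nn.LSTM, [16, 32])],   # tensor sizes = sequence lengths
    'batch_size': [8, 16],
    # nn.LSTM returns (output, (h_n, c_n)); keep only the output tensor
    'get_output_layer': lambda output, hidden: output,
    'get_output_size': None,          # fall back to reduce(mul, out.shape[1:])
}

# e.g. run_check(example_spec, repeat=5, number=10, report_name='lstm_report.json')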
def train(models, epochs, dataset, olr, lr_reset_threshold=1e-05,
          output_name='/tmp/', device_name='cuda'):
    device = torch.device(device_name)

    train_loader = torch.utils.data.DataLoader(
        dataset=dataset, batch_size=64, shuffle=True, num_workers=4)

    if torch.cuda.is_available():
        nd = torch.cuda.device_count()
        devices = [torch.device(f'cuda:{i}') for i in range(nd)]
    else:
        nd = 1
        devices = [torch.device('cpu')]

    # number of batches per epoch
    dataset_size = len(train_loader)

    models_optim = {}
    for name, model in models.items():
        model = model.to(device)
        optimizer = WindowedSGD(
            model.parameters(),
            epoch_steps=dataset_size,
            window=dataset_size,
            lr_min=lr_reset_threshold,
            lr=olr)
        model.train()
        models_optim[name] = (model, optimizer)

    epoch_time = MultiStageChrono(name='train', skip_obs=10)

    costs = []
    print('Start training')
    for e in range(0, epochs):
        all_cost = [0] * len(models_optim)

        with epoch_time.time('epoch') as step_time:
            for batch_idx, (data, target) in enumerate(train_loader):
                with epoch_time.time('models'):
                    # models are spread round-robin across the available devices
                    for mid, (name, (model, optimizer)) in enumerate(models_optim.items()):
                        device = devices[mid % nd]
                        if torch.cuda.is_available():
                            torch.cuda.set_device(device)

                        # .to(device, dtype, non_blocking, copy)
                        data = data.to(device, torch.float, True, True)
                        target = target.to(device, torch.long, True, True)
                        model = model.to(device)

                        with epoch_time.time(name):
                            optimizer.zero_grad()
                            output = model(data)
                            loss = F.nll_loss(output, target)
                            loss.backward()
                            all_cost[mid] += loss.item()
                            optimizer.step(loss)

                        if torch.cuda.is_available():
                            torch.cuda.synchronize()

            with epoch_time.time('check_point'):
                for name, (model, _) in models_optim.items():
                    torch.save(model.state_dict(), f'{output_name}/{name}_{e}')

        infos = [
            f'{all_cost[idx]:8.2f}, {models_optim[name][1].lr:10.8f}'
            for idx, name in enumerate(models_optim)
        ]
        print(f'{e:3d}/{epochs:3d}, {step_time.val:6.2f}, ' + ', '.join(infos))
        costs.append(all_cost)

    print(epoch_time.to_json())
    return costs
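# Example invocation of train() (a minimal sketch, not part of the original
# code; it assumes torchvision is installed and reuses the WindowedSGD
# signature exactly as train() does above).
def example_train():
    from torchvision import datasets, transforms

    mnist = datasets.MNIST(
        '/tmp/mnist', train=True, download=True,
        transform=transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307,), (0.3081,)),
        ]))

    # F.nll_loss in train() expects log-probabilities, hence the LogSoftmax head
    def make_mlp(hidden):
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(28 * 28, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 10),
            nn.LogSoftmax(dim=1))

    # two models of different capacity are trained side by side so their
    # per-epoch costs can be compared
    return train(
        models={'mlp128': make_mlp(128), 'mlp256': make_mlp(256)},
        epochs=2,
        dataset=mnist,
        olr=0.01,
        output_name='/tmp',
        device_name='cuda' if torch.cuda.is_available() else 'cpu')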
def main():
    # ---- argument parsing
    parser = argparse.ArgumentParser()
    parser.add_argument('--data', metavar='DIR', help='path to dataset')
    parser.add_argument('--arch', '-a', metavar='ARCH', default='resnet18')
    parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, metavar='LR')
    parser.add_argument('--opt-level', type=str)
    parser.add_argument('--cuda', action='store_true', default=True, dest='cuda')
    parser.add_argument('--no-cuda', action='store_false', dest='cuda')
    parser.add_argument('--batch-size', type=int, default=128)
    parser.add_argument('--loader', type=str, default='torch')
    parser.add_argument('--prof', type=int, default=None)
    parser.add_argument('--workers', type=int, default=4)
    parser.add_argument('--seed', type=int, default=4)
    parser.add_argument('--epochs', type=int, default=4)
    # synchronize around every sub-stage, not only once per epoch
    parser.add_argument('--sync-all', action='store_true', default=False)

    args = parser.parse_args()
    chrono = MultiStageChrono(skip_obs=10, sync=None)

    device = torch.device('cpu')
    if torch.cuda.is_available() and args.cuda:
        device = torch.device('cuda')

    torch.set_num_threads(args.workers)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # -- enable cudnn autotuning when available
    try:
        import torch.backends.cudnn as cudnn
        cudnn.benchmark = True
    except ImportError:
        pass

    # ---- model, loss and optimizer
    model = models.__dict__[args.arch]()
    model = model.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.SGD(model.parameters(), args.lr)

    # ---- optional apex/amp mixed-precision initialization (left disabled)
    # model, optimizer = amp.initialize(
    #     model,
    #     optimizer,
    #     enabled=args.opt_level != 'O0',
    #     cast_model_type=None,
    #     patch_torch_functions=True,
    #     keep_batchnorm_fp32=None,
    #     master_weights=None,
    #     loss_scale="dynamic",
    #     opt_level=args.opt_level
    # )

    # ---- data loading
    train_loader = loaders.load_dataset(args, train=True)

    # the dataset is reduced, but it should be big enough for a benchmark
    batch_iter = iter(train_loader)

    def next_batch(iterator):
        # restart the iterator when the (reduced) dataset is exhausted
        try:
            return next(iterator), iterator
        except StopIteration:
            iterator = iter(train_loader)
            return next(iterator), iterator

    batch_count = len(train_loader)
    if args.prof is not None:
        batch_count = args.prof

    sync_fun = lambda: torch.cuda.current_stream().synchronize()
    sub_syncs = None
    if args.sync_all:
        sub_syncs = sync_fun

    print('Computing...')
    model.train()
    for epoch in range(args.epochs):
        # sync only after `batch_count` batches so we do not slow things down
        with chrono.time('train', skip_obs=1, sync=sync_fun) as timer:
            for _ in range(batch_count):
                # data loading does not start here, so this does not measure the
                # loading itself, only the time spent waiting for it to finish
                with chrono.time('loading', sync=sub_syncs):
                    (input, target), batch_iter = next_batch(batch_iter)
                    input = input.to(device)
                    target = target.to(device)

                # without a synchronize we would only measure the CUDA kernel
                # `launch time`, not the actual compute
                with chrono.time('compute', sync=sub_syncs):
                    output = model(input)
                    loss = criterion(output, target)

                    # compute gradient and do SGD step
                    optimizer.zero_grad()
                    # with amp.scale_loss(loss, optimizer) as scaled_loss:
                    #     scaled_loss.backward()
                    loss.backward()
                    optimizer.step()

        print(f'[{epoch:3d}/{args.epochs:3d}] ETA: {(args.epochs - epoch - 1) * timer.avg:6.2f} sec')

    print('--')
    print(chrono.to_json(indent=2))
    print('--')
    print(f'{(args.batch_size * batch_count) / chrono.chronos["train"].avg:6.2f} Img/sec')
    print('-' * 25)
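# Entry point; for example (flags as defined by the parser in main(), script
# name hypothetical):
#
#   python benchmark.py --arch resnet50 --batch-size 64 --epochs 2 --prof 100
#
if __name__ == '__main__':
    main()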