def train_imagenet(): print('==> Preparing data..') img_dim = get_model_property('img_dim') if FLAGS.fake_data: train_dataset_len = 1200000 # Roughly the size of Imagenet dataset. train_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 3, img_dim, img_dim), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=train_dataset_len // FLAGS.batch_size // xm.xrt_world_size()) test_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.test_set_batch_size, 3, img_dim, img_dim), torch.zeros(FLAGS.test_set_batch_size, dtype=torch.int64)), sample_count=50000 // FLAGS.batch_size // xm.xrt_world_size()) else: normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = torchvision.datasets.ImageFolder( os.path.join(FLAGS.datadir, 'train'), transforms.Compose([ transforms.RandomResizedCrop(img_dim), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])) train_dataset_len = len(train_dataset.imgs) resize_dim = max(img_dim, 256) test_dataset = torchvision.datasets.ImageFolder( os.path.join(FLAGS.datadir, 'val'), # Matches Torchvision's eval transforms except Torchvision uses size # 256 resize for all models both here and in the train loader. Their # version crashes during training on 299x299 images, e.g. inception. transforms.Compose([ transforms.Resize(resize_dim), transforms.CenterCrop(img_dim), transforms.ToTensor(), normalize, ])) train_sampler, test_sampler = None, None if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=False) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=FLAGS.batch_size, sampler=train_sampler, drop_last=FLAGS.drop_last, shuffle=False if train_sampler else True, num_workers=FLAGS.num_workers) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=FLAGS.test_set_batch_size, sampler=test_sampler, drop_last=FLAGS.drop_last, shuffle=False, num_workers=FLAGS.num_workers) torch.manual_seed(42) device = xm.xla_device() model = get_model_property('model_fn')().to(device) writer = None if xm.is_master_ordinal(): writer = test_utils.get_summary_writer(FLAGS.logdir) optimizer = optim.SGD(model.parameters(), lr=FLAGS.lr, momentum=FLAGS.momentum, weight_decay=1e-4) num_training_steps_per_epoch = train_dataset_len // (FLAGS.batch_size * xm.xrt_world_size()) lr_scheduler = schedulers.wrap_optimizer_with_scheduler( optimizer, scheduler_type=getattr(FLAGS, 'lr_scheduler_type', None), scheduler_divisor=getattr(FLAGS, 'lr_scheduler_divisor', None), scheduler_divide_every_n_epochs=getattr( FLAGS, 'lr_scheduler_divide_every_n_epochs', None), num_steps_per_epoch=num_training_steps_per_epoch, summary_writer=writer) loss_fn = nn.CrossEntropyLoss() def train_loop_fn(loader, epoch): tracker = xm.RateTracker() model.train() for step, (data, target) in enumerate(loader): optimizer.zero_grad() output = model(data) loss = loss_fn(output, target) loss.backward() xm.optimizer_step(optimizer) tracker.add(FLAGS.batch_size) if lr_scheduler: lr_scheduler.step() if step % FLAGS.log_steps == 0: xm.add_step_closure(_train_update, args=(device, step, loss, tracker, epoch, writer)) def test_loop_fn(loader, epoch): total_samples, correct = 0, 0 model.eval() for step, (data, target) in enumerate(loader): output = model(data) pred = 
output.max(1, keepdim=True)[1] correct += pred.eq(target.view_as(pred)).sum() total_samples += data.size()[0] if step % FLAGS.log_steps == 0: xm.add_step_closure(test_utils.print_test_update, args=(device, None, epoch, step)) accuracy = 100.0 * correct.item() / total_samples accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean) return accuracy train_device_loader = pl.MpDeviceLoader(train_loader, device) test_device_loader = pl.MpDeviceLoader(test_loader, device) accuracy, max_accuracy = 0.0, 0.0 for epoch in range(1, FLAGS.num_epochs + 1): xm.master_print('Epoch {} train begin {}'.format( epoch, test_utils.now())) train_loop_fn(train_device_loader, epoch) xm.master_print('Epoch {} train end {}'.format(epoch, test_utils.now())) accuracy = test_loop_fn(test_device_loader, epoch) xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format( epoch, test_utils.now(), accuracy)) max_accuracy = max(accuracy, max_accuracy) test_utils.write_to_summary(writer, epoch, dict_to_write={'Accuracy/test': accuracy}, write_xla_metrics=True) if FLAGS.metrics_debug: xm.master_print(met.metrics_report()) test_utils.close_summary_writer(writer) xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy)) return max_accuracy
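# The step closures above reference a `_train_update` helper that is not defined in
# this section. A minimal sketch, assuming the standard test_utils helpers from the
# PyTorch/XLA examples (the MNIST variants below call the same helper without the
# epoch argument):
def _train_update(device, step, loss, tracker, epoch, writer):
  test_utils.print_training_update(
      device,
      step,
      loss.item(),
      tracker.rate(),
      tracker.global_rate(),
      epoch,
      summary_writer=writer)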
def train_mnist(flags, state_dict): if flags.fake_data: train_loader = xu.SampleGenerator( data=(torch.zeros(flags.batch_size, 1, 28, 28), torch.zeros(flags.batch_size, dtype=torch.int64)), sample_count=60000 // flags.batch_size // xm.xrt_world_size()) test_loader = xu.SampleGenerator( data=(torch.zeros(flags.batch_size, 1, 28, 28), torch.zeros(flags.batch_size, dtype=torch.int64)), sample_count=10000 // flags.batch_size // xm.xrt_world_size()) else: train_dataset = datasets.MNIST(os.path.join(flags.datadir, str(xm.get_ordinal())), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])) test_dataset = datasets.MNIST(os.path.join(flags.datadir, str(xm.get_ordinal())), train=False, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])) train_sampler = None if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=flags.batch_size, sampler=train_sampler, drop_last=flags.drop_last, shuffle=False if train_sampler else True, num_workers=flags.num_workers) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=flags.batch_size, drop_last=flags.drop_last, shuffle=False, num_workers=flags.num_workers) # Scale learning rate to num cores lr = flags.lr * xm.xrt_world_size() device = xm.xla_device() model = MNIST() model.load_state_dict(state_dict) model = model.to(device) writer = None if xm.is_master_ordinal(): writer = test_utils.get_summary_writer(flags.logdir) optimizer = optim.SGD(model.parameters(), lr=lr, momentum=flags.momentum) loss_fn = nn.NLLLoss() def train_loop_fn(loader): tracker = xm.RateTracker() model.train() for step, (data, target) in enumerate(loader): optimizer.zero_grad() output = model(data) loss = loss_fn(output, target) loss.backward() xm.optimizer_step(optimizer) tracker.add(flags.batch_size) if step % flags.log_steps == 0: xm.add_step_closure(_train_update, args=(device, step, loss, tracker, writer), run_async=FLAGS.async_closures) def test_loop_fn(loader): total_samples = 0 correct = 0 model.eval() for data, target in loader: output = model(data) pred = output.max(1, keepdim=True)[1] correct += pred.eq(target.view_as(pred)).sum() total_samples += data.size()[0] accuracy = 100.0 * correct.item() / total_samples # accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean) return accuracy train_device_loader = pl.MpDeviceLoader(train_loader, device) test_device_loader = pl.MpDeviceLoader(test_loader, device) accuracy, max_accuracy = 0.0, 0.0 for epoch in range(1, flags.num_epochs + 1): xm.master_print('Epoch {} train begin {}'.format( epoch, test_utils.now())) train_loop_fn(train_device_loader) xm.master_print('Epoch {} train end {}'.format(epoch, test_utils.now())) accuracy = test_loop_fn(test_device_loader) xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format( epoch, test_utils.now(), accuracy)) max_accuracy = max(accuracy, max_accuracy) test_utils.write_to_summary(writer, epoch, dict_to_write={'Accuracy/test': accuracy}, write_xla_metrics=True) if flags.metrics_debug: xm.master_print(met.metrics_report()) test_utils.close_summary_writer(writer) xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy)) return max_accuracy
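# A possible launcher for the variant above (a sketch, not part of the original
# code): the initial state_dict is built once on the CPU and handed to every
# process, so all replicas start from identical weights.
import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(index, flags, state_dict):
  train_mnist(flags, state_dict)

if __name__ == '__main__':
  initial_state_dict = MNIST().state_dict()
  xmp.spawn(_mp_fn, args=(FLAGS, initial_state_dict), nprocs=FLAGS.num_cores)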
def train_mnist(): torch.manual_seed(1) if FLAGS.fake_data: train_dataset_len = 60000 # Number of images in MNIST dataset. train_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 1, 28, 28), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=train_dataset_len // FLAGS.batch_size // xm.xrt_world_size()) test_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 1, 28, 28), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=10000 // FLAGS.batch_size // xm.xrt_world_size()) else: train_dataset = datasets.MNIST(FLAGS.datadir, train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])) train_dataset_len = len(train_dataset) test_dataset = datasets.MNIST(FLAGS.datadir, train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])) train_sampler = None test_sampler = None if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=False) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=FLAGS.batch_size, sampler=train_sampler, drop_last=FLAGS.drop_last, shuffle=False if train_sampler else True, num_workers=FLAGS.num_workers) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=FLAGS.batch_size, sampler=test_sampler, drop_last=FLAGS.drop_last, shuffle=False, num_workers=FLAGS.num_workers) devices = (xm.get_xla_supported_devices( max_devices=FLAGS.num_cores) if FLAGS.num_cores != 0 else []) # Scale learning rate to num cores lr = FLAGS.lr * max(len(devices), 1) # Pass [] as device_ids to run using the PyTorch/CPU engine. 
model_parallel = dp.DataParallel(MNIST, device_ids=devices) def train_loop_fn(model, loader, device, context): loss_fn = nn.NLLLoss() optimizer = context.getattr_or( 'optimizer', lambda: optim.SGD( model.parameters(), lr=lr, momentum=FLAGS.momentum)) tracker = xm.RateTracker() model.train() for x, (data, target) in enumerate(loader): optimizer.zero_grad() output = model(data) loss = loss_fn(output, target) loss.backward() xm.optimizer_step(optimizer) tracker.add(FLAGS.batch_size) if x % FLAGS.log_steps == 0: test_utils.print_training_update(device, x, loss.item(), tracker.rate(), tracker.global_rate()) def test_loop_fn(model, loader, device, context): total_samples = 0 correct = 0 model.eval() for data, target in loader: output = model(data) pred = output.max(1, keepdim=True)[1] correct += pred.eq(target.view_as(pred)).sum().item() total_samples += data.size()[0] accuracy = 100.0 * correct / total_samples test_utils.print_test_update(device, accuracy) return accuracy accuracy = 0.0 writer = test_utils.get_summary_writer(FLAGS.logdir) num_devices = len( xm.xla_replication_devices(devices)) if len(devices) > 1 else 1 num_training_steps_per_epoch = train_dataset_len // (FLAGS.batch_size * num_devices) for epoch in range(1, FLAGS.num_epochs + 1): model_parallel(train_loop_fn, train_loader) accuracies = model_parallel(test_loop_fn, test_loader) accuracy = mean(accuracies) print('Epoch: {}, Mean Accuracy: {:.2f}%'.format(epoch, accuracy)) global_step = (epoch - 1) * num_training_steps_per_epoch test_utils.add_scalar_to_summary(writer, 'Accuracy/test', accuracy, global_step) if FLAGS.metrics_debug: print(met.metrics_report()) test_utils.close_summary_writer(writer) return accuracy
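# `context.getattr_or` above lazily creates per-device state (here, the optimizer)
# the first time a replica runs the loop. A rough sketch of the behaviour assumed
# from torch_xla's data_parallel Context:
class _ContextSketch:
  def getattr_or(self, name, defval_fn):
    value = getattr(self, name, None)
    if value is None:
      value = defval_fn()
      setattr(self, name, value)
    return value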
def train_mnist(FLAGS): DTYPE = torch.float32 torch.manual_seed(1) dims = ( FLAGS.batch_size, 1, 784, ) train_dataset_len = FLAGS.steps_per_epoch if FLAGS.steps_per_epoch else 60000 train_loader = xu.SampleGenerator( data=( torch.ones(dims, dtype=DTYPE,), torch.ones( FLAGS.batch_size, dtype=torch.int64 if not _MSE_LOSS else DTYPE, ), ), sample_count=train_dataset_len // FLAGS.batch_size // xm.xrt_world_size(), ) test_loader = xu.SampleGenerator( data=( torch.ones(dims, dtype=DTYPE,), torch.ones( FLAGS.batch_size, dtype=torch.int64 if not _MSE_LOSS else DTYPE, ), ), sample_count=10000 // FLAGS.batch_size // xm.xrt_world_size(), ) devices = ( xm.get_xla_supported_devices(max_devices=FLAGS.num_cores) if FLAGS.num_cores != 0 else [] ) """ Non multi-processing """ # Scale learning rate to num cores lr = FLAGS.lr * max(len(devices), 1) model = MNIST(FLAGS) model_parallel = dp.DataParallel( model, device_ids=devices, ) writer = None if xm.is_master_ordinal(): writer = test_utils.get_summary_writer(FLAGS.logdir) # Just some step closure output def train_output_fn(outputs, ctx, args, tracker): if ctx.step > 0 and args.log_steps and ctx.step % args.log_steps == 0: now_time = time.time() if hasattr(ctx, 'start_time') and ctx.start_time: per_step_time = (now_time - ctx.start_time) / ( ctx.step - ctx.last_step_timed ) steps_per_second = 1 / per_step_time print( f'[{xm.get_ordinal()}] Round-trip step time: ' f'{per_step_time} seconds, steps per second: {steps_per_second}' ) if tracker: _train_update( device=device, step=ctx.step, loss=outputs[0], tracker=tracker, epoch=epoch, writer=writer, ) print(f'BEGIN Train step {ctx.step}') ctx.start_time = time.time() ctx.last_step_timed = ctx.step else: ctx.start_time = time.time() ctx.last_step_timed = ctx.step ctx.step += 1 def train_loop_fn(model, loader, device=None, context=None): lr_adder = 0.0 if _MSE_LOSS: loss_fn = nn.MSELoss() else: loss_fn = nn.NLLLoss() optimizer = context.getattr_or( 'optimizer', lambda: optim.SGD( model.parameters(), lr=lr + lr_adder, momentum=FLAGS.momentum, ), ) tracker = xm.RateTracker() model.train() def train_inner_loop_fn(batch, ctx): step = ctx.step print(f'Step {step}') data = batch[0] target = batch[1] optimizer.zero_grad() output = model(data) loss = loss_fn(output, target) loss.backward() xm.optimizer_step( optimizer, barrier=False, ) if ( FLAGS.log_steps != 0 and ( FLAGS.log_steps == 1 or (step > 0 and step % FLAGS.log_steps == 0) ) ): xm.add_step_closure( _train_update, args=(device, step, loss, tracker, epoch, writer), ) if step == 0: xm.master_print(f"End TRAIN step {step}") ctx.step += 1 return [loss] step = 0 # Train print('Starting new epoch train loop... 
(epoch={epoch})') for step, (data, target) in enumerate(loader): if step % FLAGS.step_print_interval == 0: xm.master_print(f"Begin TRAIN Step: {step}") context.step = step if not FLAGS.use_autograph: outputs = train_inner_loop_fn((data, target), context) else: outputs = ptwse.flow.runner.maybe_run_converted( train_inner_loop_fn, (data, target), context ) xm.master_print(f"Saving model...") _save_checkpoint(FLAGS, device, None, model, is_epoch=True) xm.master_print(f"Model saved") def test_loop_fn(model, loader, device, context): print("***********************") print("ENTERING TEST FUNCTION") print("***********************") print('Evaluating...') total_samples = 0 correct = 0 model.eval() for step, (data, target) in enumerate(loader): if step >= FLAGS.test_max_step: break output = model(data) pred = output.max(1, keepdim=True)[1] if FLAGS.mp: correct += pred.eq(target.view_as(pred)).sum() else: correct += pred.eq(target.view_as(pred)).sum().item() total_samples += data.size()[0] if FLAGS.mp: this_accuracy = 100.0 * correct.item() / total_samples print("CALLING: mesh_reduce('test_accuracy')") this_accuracy = xm.mesh_reduce( 'test_accuracy', this_accuracy, np.mean ) print("BACK FROM: mesh_reduce('test_accuracy')") else: this_accuracy = 100.0 * correct / total_samples test_utils.print_test_update(device, this_accuracy) print("***********************") print("LEAVING TEST FUNCTION") print("***********************") return this_accuracy # # Set up for # accuracy = 0.0 num_devices = ( len(xm.xla_replication_devices(devices)) if len(devices) > 1 else 1 ) if not FLAGS.steps_per_epoch: num_training_steps_per_epoch = train_dataset_len // ( FLAGS.batch_size * num_devices ) else: num_training_steps_per_epoch = FLAGS.steps_per_epoch max_accuracy = 0.0 # # Epoch loop # for epoch in range(1, FLAGS.num_epochs + 1): # # Train # device = xm.xla_device() ctx = dp.Context(device=device) ctx.tracker = xm.RateTracker() ctx.step = 0 train_loop_fn(model, train_loader, device, ctx) # # Test # if FLAGS.run_test: with ptwse.scope.proxy_disabled(disabled=FLAGS.test_off_proxy): accuracies = model_parallel(test_loop_fn, test_loader) accuracy = mean(accuracies) print( 'Epoch: {}, Mean Accuracy: {:.2f}%'.format(epoch, accuracy) ) global_step = (epoch - 1) * num_training_steps_per_epoch max_accuracy = max(accuracy, max_accuracy) test_utils.write_to_summary( writer, global_step, dict_to_write={'Accuracy/test': accuracy}, write_xla_metrics=True, ) if FLAGS.metrics_debug: xm.master_print(met.metrics_report()) test_utils.close_summary_writer(writer) xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy)) return max_accuracy
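# `_save_checkpoint` is called above but not defined in this section. A hypothetical
# implementation (the path layout and file name are assumptions):
def _save_checkpoint(flags, device, step, model, is_epoch=False):
  suffix = 'epoch' if is_epoch else 'step-{}'.format(step)
  path = os.path.join(flags.logdir or '.', 'mnist-checkpoint-{}.pt'.format(suffix))
  # xm.save moves XLA tensors to CPU and writes only from the master ordinal.
  xm.save(model.state_dict(), path)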
def train(rank, args):
  print('enter train @ %s' % (rank), flush=True)
  args.rank = rank
  torch.manual_seed(42)
  tokenizer = get_tokenizer(args)
  args.vocab_size = tokenizer._tokenizer.get_vocab_size()
  train_dataset = get_dataset(args)
  if args.total_num_updates < 100:
    args.total_num_updates = len(train_dataset) * args.total_num_updates
  if args.warmup_updates < 1:
    args.warmup_updates = int(args.total_num_updates * args.warmup_updates)
  else:
    args.warmup_updates = int(args.warmup_updates)
  train_sampler = None
  if args.gpus:
    dist.init_process_group('nccl', rank=rank, world_size=args.world_size)
    if args.gpus > 1:
      train_sampler = torch.utils.data.distributed.DistributedSampler(
          train_dataset, num_replicas=args.gpus, rank=rank, shuffle=False)
  else:
    rank = xm.get_ordinal()
    if xm.xrt_world_size() > 1:
      train_sampler = torch.utils.data.distributed.DistributedSampler(
          train_dataset,
          num_replicas=xm.xrt_world_size(),
          rank=rank,
          shuffle=False)
  train_loader = torch.utils.data.DataLoader(
      train_dataset,
      batch_size=args.batch_size
      if not hasattr(train_dataset, '__getbatch__') else None,
      sampler=train_sampler,
      pin_memory=True,
      shuffle=False,
      num_workers=args.num_workers)
  eval_loader = None
  if args.eval_dir:
    eval_sampler = None
    # The process group is already initialized above, so only the eval sampler
    # is set up here.
    if args.gpus:
      if args.gpus > 1:
        eval_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset, num_replicas=args.gpus, rank=rank, shuffle=False)
    else:
      rank = xm.get_ordinal()
      if xm.xrt_world_size() > 1:
        eval_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset,
            num_replicas=xm.xrt_world_size(),
            rank=rank,
            shuffle=False)
    eval_dataset = get_eval_dataset(args)
    eval_loader = torch.utils.data.DataLoader(
        eval_dataset,
        batch_size=args.batch_size
        if not hasattr(train_dataset, '__getbatch__') else None,
        sampler=eval_sampler,
        pin_memory=True,
        shuffle=False,
        num_workers=args.num_workers)
  if args.gpus:
    assert apex_enabled
    torch.cuda.set_device(rank)
    ##########################
    ##
    ## Model Creation
    ##
    ##########################
    model = get_model(args)
    model.cuda(rank)
    device = torch.device('cuda:' + str(rank))
    ##########################
    ##
    ## Init Optimizer
    ##
    ##########################
    optimizer = apex.optimizers.FusedAdam(
        model_get_parameters(
            model,
            lr=args.lr,
            lw_lr_decay=args.lw_lr_decay,
            weight_decay=args.weight_decay),
        # use this function to set extra optimizer arguments,
        # see model_get_parameters
        betas=(0.9, 0.999),
        eps=1e-6,
        lr=args.lr,
        weight_decay=args.weight_decay)
    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    model = DDP(model)
    batches = train_loader
  else:
    assert tpu_enabled
    device = xm.xla_device()
    ##########################
    ##
    ## Model Creation
    ##
    ##########################
    model = get_model(args)
    ##########################
    ##
    ## For shared parameters, TPU requires modules to be tied after .to(device),
    ## so we find the shared parameters first.
    ##
    ##########################
    shared_parameters = {e[0]: e[1:] for e in _catalog_shared_params(model)}
    model.to(device)
    do_share_parameters_again(model, shared_parameters, log=rank == 0)
    ##########################
    ##
    ## Init Optimizer
    ##
    ##########################
    optimizer = optim.Adam(
        model_get_parameters(
            model,
            lr=args.lr,
            lw_lr_decay=args.lw_lr_decay,
            weight_decay=args.weight_decay),
        # use this function to set extra optimizer arguments,
        # see model_get_parameters
        lr=args.lr,
        weight_decay=args.weight_decay)
  writer = None
  if xm.is_master_ordinal():
    writer = 
test_utils.get_summary_writer(args.save_dir) xm.rendezvous("load_checkpoint") # wait for all workers xm.mark_step() # tracker = xm.RateTracker() if args.restore_file: states = torch.load(args.restore_file, map_location=device) for k, v in list(states.items()): if k.startswith('module.'): del states[k] k = k[7:] states[k] = v if k.endswith('position_ids'): del states[k] states[k[:-12] + 'position_embeddings'] = v try: model.load_state_dict(states) except Exception as err: import traceback traceback.print_exc() model.load_state_dict(states, strict=False) model.train() if args.anomaly_detection and rank == 0: torch.set_anomaly_enabled(True) ########################## ## ## Init LR Scheduler ## ########################## scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_updates, num_training_steps=args.total_num_updates, ) step_i = 0 err = None try: if rank == 0: pbar = tqdm(total=args.total_num_updates) while step_i < args.total_num_updates: if not args.gpus: batches = pl.ParallelLoader(train_loader, [device]).per_device_loader(device) for sample in batches: step_i += 1 if step_i > args.total_num_updates: break report_step = step_i % args.log_interval == 0 while True: # the loop only for apex Gradient Overflow optimizer.zero_grad() total_loss, log = get_loss(model, sample, args=args, device=device, gpu=args.gpus, report=report_step) if args.gpus: default_optimizer_step = optimizer.step with amp.scale_loss(total_loss, optimizer) as scaled_loss: scaled_loss.backward() # If Amp detects an overflow, it patches optimizer.step. In other words, if optimizer.step # was left unpatched, there was no overflow, and we don't need to replay. if optimizer.step is default_optimizer_step: optimizer.step() break optimizer.step( ) # If an overflow was detected, "optimizer.step" is the patched call, which does # nothing but restore optimizer.step to default_optimizer_step. if rank == 0: print( "Overflowed, reducing loss scale and replaying batch.", flush=True) else: total_loss.backward() xm.optimizer_step(optimizer) xm.mark_step() break scheduler.step() if report_step: if 'loss' not in log: log['loss'] = total_loss if args.gpus: if rank == 0: pbar.set_description(format_log( log, log_formatter)) else: xm.add_step_closure(_train_update, args=(log, log_formatter)) if args.report_metrics: xm.master_print(met.metrics_report()) if rank == 0: pbar.update(1) if eval_loader is not None: model.eval() if not args.gpus: batches = pl.ParallelLoader(eval_loader, [device]).per_device_loader(device) with torch.no_grad(): record = OrderedDict() for sample in batches: evaluate(model, sample, args=args, device=device, record=record, gpu=args.gpus, report=report_step) post_evaluate(record, args=args) import json print('', flush=True) print(json.dumps(record), flush=True) print('', flush=True) except Exception as _err: err = _err finally: save_fn = os.path.join(args.save_dir, 'checkpoint_final.pt') folder = os.path.split(os.path.abspath(save_fn))[0] os.makedirs(folder, exist_ok=True) if rank == 0 and args.gpus: torch.save(model.state_dict(), save_fn) if err: raise err else: xm.save(model.state_dict(), save_fn) if err: raise err
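# The `_catalog_shared_params` / `do_share_parameters_again` pair used above is not
# defined in this section. A minimal sketch of the idea (an assumption about the
# helpers' behaviour): record which parameter paths alias the same tensor before
# `.to(device)`, then re-tie them afterwards, since moving a model to an XLA device
# can leave tied weights as separate copies.
def _catalog_shared_params(model):
  seen, shared = {}, []
  for name, param in model.named_parameters(remove_duplicate=False):
    if param in seen:
      shared.append((seen[param], name))
    else:
      seen[param] = name
  return shared

def do_share_parameters_again(model, shared_parameters, log=False):
  for src, duplicates in shared_parameters.items():
    src_param = model.get_parameter(src)
    for dup in duplicates:
      module_path, _, attr = dup.rpartition('.')
      setattr(model.get_submodule(module_path), attr, src_param)
      if log:
        print('re-tied {} -> {}'.format(dup, src))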
def train(rank, args): print('enter train @ %s'%(rank), flush=True) args.rank = rank args.split = '' torch.manual_seed(42) save_fn = os.path.join(args.save_dir, 'checkpoint_final.pt') tokenizer = get_tokenizer(args) args.vocab_size = tokenizer._tokenizer.get_vocab_size() if not args.vocab_size else args.vocab_size train_dataset = get_dataset(args) batched_already = hasattr(train_dataset, '__getbatch__') if args.total_num_updates < 100: args.total_num_updates = len(train_dataset) * args.total_num_updates if args.warmup_updates < 1: args.warmup_updates = int(args.total_num_updates * args.warmup_updates) else: args.warmup_updates = int(args.warmup_updates) train_sampler = None if args.gpus: dist.init_process_group( 'nccl', rank=rank, world_size=args.world_size ) if args.gpus > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=args.gpus, rank=rank, shuffle=args.shuffle) else: rank = xm.get_ordinal() if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=rank, shuffle=args.shuffle) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size if not batched_already else None, sampler=train_sampler, pin_memory=True, shuffle=False, num_workers=args.num_workers) eval_loaders = [] if args.eval_dir: for split in args.splits.split(','): split = split.strip() eval_sampler = None if args.gpus: if args.gpus > 1: eval_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=args.gpus, rank=rank, shuffle=False) else: rank = xm.get_ordinal() if xm.xrt_world_size() > 1: eval_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=rank, shuffle=False) args.split = split eval_dataset = get_eval_dataset(args) eval_loader = torch.utils.data.DataLoader( eval_dataset, batch_size=args.batch_size if not batched_already else None, sampler=eval_sampler, pin_memory=True, shuffle=False, num_workers=args.num_workers) eval_loaders.append(eval_loader) if args.gpus: assert apex_enabled torch.cuda.set_device(rank) ########################## ## ## Model Creation ## ########################## model = get_model(args, tokenizer) model.cuda(rank) device = torch.device('cuda:'+str(rank)) ########################## ## ## Init Optimizer ## ########################## optimizer = apex.optimizers.FusedAdam( model_get_parameters(model, lr=args.lr, lw_lr_decay=args.lw_lr_decay, weight_decay=args.weight_decay, special_layer_wise_lr=args.special_layer_wise_lr, log = rank == 0, ), # use this function to set extra optimizer arguments, # see model_get_parameters betas=(0.9, 0.999), eps=1e-6, lr=args.lr, weight_decay=args.weight_decay ) model, optimizer = amp.initialize(model, optimizer, opt_level='O1') model = DDP(model) batches = train_loader else: assert tpu_enabled device = xm.xla_device() ########################## ## ## Model Creation ## ########################## model = get_model(args, tokenizer) ########################## ## ## For shared parameters, TPU requires modules to be tied after .to(device) ## So we first find the shared parameters first ## ########################## shared_parameters = {e[0]: e[1:] for e in _catalog_shared_params(model)} model.to(device) do_share_parameters_again(model, shared_parameters, log = rank == 0) ########################## ## ## Init Optimizer ## ########################## optimizer = optim.Adam( model_get_parameters(model, lr=args.lr, 
lw_lr_decay=args.lw_lr_decay, weight_decay=args.weight_decay ), # use this function to set extra optimizer arguments, # see model_get_parameters lr=args.lr, weight_decay=args.weight_decay ) writer = None if xm.is_master_ordinal(): writer = test_utils.get_summary_writer(args.save_dir) xm.rendezvous("load_checkpoint") # wait for all workers xm.mark_step() # tracker = xm.RateTracker() if args.restore_file: states = torch.load(args.restore_file, map_location=device) for k, v in list(states.items()): if k.startswith('module.'): del states[k] k = k[7:] states[k] = v if k.endswith('position_ids'): del states[k] states[k[:-12] + 'position_embeddings'] = v if args.gpus: states = {"module.%s"%k : v for k, v in states.items()} try: model.load_state_dict(states) except Exception as err: import traceback if rank == 0: traceback.print_exc() model.load_state_dict(states, strict=False) if rank == 0: if not os.path.exists(os.path.dirname(save_fn)): try: os.makedirs(os.path.dirname(save_fn)) except OSError as exc: # Guard against race condition if exc.errno != errno.EEXIST: raise if args.gpus: torch.save(model.state_dict(), save_fn ) else: xm.save(model.state_dict(), save_fn ) model.train() if args.anomaly_detection and rank == 0: torch.set_anomaly_enabled(True) ########################## ## ## Init LR Scheduler ## ########################## if not batched_already: args.total_num_updates = args.total_num_updates // args.batch_size args.warmup_updates = args.total_num_updates // args.batch_size args.total_num_updates = args.total_num_updates // args.world_size args.warmup_updates = args.total_num_updates // args.world_size scheduler = get_linear_schedule_with_warmup( optimizer, num_warmup_steps=args.warmup_updates, num_training_steps=args.total_num_updates, ) step_i = 0 err = None tb = None #tb = SummaryWriter() try: if rank == 0: pbar = tqdm(total=args.total_num_updates, file=sys.stdout) while step_i < args.total_num_updates: if not args.gpus: batches = pl.ParallelLoader(train_loader, [device]).per_device_loader(device) n_samples = len(batches) for sample in batches: step_i += 1 if step_i > args.total_num_updates: break report_step = step_i % args.log_interval == 0 while True: # the loop only for apex Gradient Overflow optimizer.zero_grad() total_loss, log = get_loss( model, sample, args=args, device=device, gpus=args.gpus, report=report_step ) if args.gpus: default_optimizer_step = optimizer.step with amp.scale_loss(total_loss, optimizer) as scaled_loss: scaled_loss.backward() # If Amp detects an overflow, it patches optimizer.step. In other words, if optimizer.step # was left unpatched, there was no overflow, and we don't need to replay. if optimizer.step is default_optimizer_step: optimizer.step() break optimizer.step() # If an overflow was detected, "optimizer.step" is the patched call, which does # nothing but restore optimizer.step to default_optimizer_step. 
if rank == 0: print("Overflowed, reducing loss scale and replaying batch.", flush=True) else: total_loss.backward() xm.optimizer_step(optimizer) xm.mark_step() break scheduler.step() if report_step: if 'loss' not in log: log['loss'] = total_loss # tb.add_scalar("Loss", total_loss, step_i) for k, v in log.items(): try: dist.all_reduce(v, op=dist.reduce_op.SUM) log[k] = float(v) except Exception as e: print(v, e) pass if args.gpus: if rank == 0: pbar.set_description(format_log(log, log_formatter, tb, step_i)) else: xm.add_step_closure(_train_update, args=(log, log_formatter, tb, step_i)) if args.report_metrics: xm.master_print(met.metrics_report()) if rank == 0: pbar.update(1) if rank == 0: pbar.close() if eval_loaders: model.half() model.eval() model.cuda() for k, v in model.named_parameters(): v.requires_grad =False for split, eval_loader in zip(args.splits.split(','), eval_loaders): batches = eval_loader if rank == 0: eval_length = len(batches) if not batched_already: eval_length = eval_length // args.batch_size eval_length = eval_length // args.world_size pbar = tqdm(total=eval_length, file=sys.stdout) if not args.gpus: batches = pl.ParallelLoader(eval_loader, [device]).per_device_loader(device) with torch.no_grad(): record = OrderedDict() for sample in batches: evaluate( model, sample, args=args, device=device, record=record, gpus=args.gpus, report=False ) if rank == 0: pbar.update(1) for k, v in record.items(): try: def handle_reduce(v): if len(v.shape) == 0: dist.all_reduce(v, op=dist.reduce_op.SUM) else: L = [torch.ones_like(v) for _ in range(dist.get_world_size())] dist.all_gather(L, v) v = torch.car(L, dim=0) return v if isinstance(v, list): v = [handle_reduce(e) for e in v] else: v = handle_reduce(v) record[k] = float(v) except Exception as e: pass post_evaluate(record, args=args) import json if rank == 0: print('',flush=True) print('Test result for %s'%split, flush=True) print(json.dumps(record, indent=2),flush=True) print('',flush=True) except Exception as _err: err = _err finally: folder = os.path.split(os.path.abspath(save_fn))[0] os.makedirs(folder, exist_ok=True) if rank == 0: print("Saving to %s"%save_fn) if args.gpus: torch.save(model.state_dict(), save_fn ) if err: raise err else: xm.save(model.state_dict(), save_fn ) if err: raise err print("Saved to %s"%save_fn)
def train_cifar(): print('==> Preparing data..') if FLAGS.fake_data: train_dataset_len = 50000 # Number of example in CIFAR train set. train_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 3, 32, 32), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=train_dataset_len // FLAGS.batch_size // xm.xrt_world_size()) test_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 3, 32, 32), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=10000 // FLAGS.batch_size // xm.xrt_world_size()) else: transform_train = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) transform_test = transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)), ]) train_dataset = torchvision.datasets.CIFAR10( root=FLAGS.datadir, train=True, download=True, transform=transform_train) train_dataset_len = len(train_dataset) test_dataset = torchvision.datasets.CIFAR10( root=FLAGS.datadir, train=False, download=True, transform=transform_test) train_sampler = None test_sampler = None if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=False) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=FLAGS.batch_size, sampler=train_sampler, drop_last=FLAGS.drop_last, shuffle=False if train_sampler else True, num_workers=FLAGS.num_workers) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=FLAGS.batch_size, sampler=test_sampler, drop_last=FLAGS.drop_last, shuffle=False, num_workers=FLAGS.num_workers) torch.manual_seed(42) devices = ( xm.get_xla_supported_devices( max_devices=FLAGS.num_cores) if FLAGS.num_cores != 0 else []) # Pass [] as device_ids to run using the PyTorch/CPU engine. 
model = torchvision.models.resnet18 if FLAGS.use_torchvision else ResNet18 model_parallel = dp.DataParallel(model, device_ids=devices) def train_loop_fn(model, loader, device, context): loss_fn = nn.CrossEntropyLoss() optimizer = context.getattr_or( 'optimizer', lambda: optim.SGD( model.parameters(), lr=FLAGS.lr, momentum=FLAGS.momentum, weight_decay=5e-4)) tracker = xm.RateTracker() model.train() for x, (data, target) in enumerate(loader): optimizer.zero_grad() output = model(data) loss = loss_fn(output, target) loss.backward() xm.optimizer_step(optimizer) tracker.add(FLAGS.batch_size) if x % FLAGS.log_steps == 0: test_utils.print_training_update(device, x, loss.item(), tracker.rate(), tracker.global_rate()) def test_loop_fn(model, loader, device, context): total_samples = 0 correct = 0 model.eval() for data, target in loader: output = model(data) pred = output.max(1, keepdim=True)[1] correct += pred.eq(target.view_as(pred)).sum().item() total_samples += data.size()[0] accuracy = 100.0 * correct / total_samples test_utils.print_test_update(device, accuracy) return accuracy accuracy = 0.0 writer = test_utils.get_summary_writer(FLAGS.logdir) num_devices = len( xm.xla_replication_devices(devices)) if len(devices) > 1 else 1 num_training_steps_per_epoch = train_dataset_len // ( FLAGS.batch_size * num_devices) max_accuracy = 0.0 for epoch in range(1, FLAGS.num_epochs + 1): model_parallel(train_loop_fn, train_loader) accuracies = model_parallel(test_loop_fn, test_loader) accuracy = mean(accuracies) max_accuracy = max(accuracy, max_accuracy) print('Epoch: {}, Mean Accuracy: {:.2f}%'.format(epoch, accuracy)) global_step = (epoch - 1) * num_training_steps_per_epoch test_utils.write_to_summary( writer, global_step, dict_to_write={'Accuracy/test': accuracy}, write_xla_metrics=True) if FLAGS.metrics_debug: xm.master_print(met.metrics_report()) test_utils.close_summary_writer(writer) print('Max Accuracy: {:.2f}%'.format(max_accuracy)) return max_accuracy
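# A typical entry point for the CIFAR script above (a sketch; the target-accuracy
# flag and the exit code follow the conventions of the PyTorch/XLA examples and are
# assumptions here):
if __name__ == '__main__':
  accuracy = train_cifar()
  if accuracy < FLAGS.target_accuracy:
    print('Accuracy {} is below target {}'.format(accuracy, FLAGS.target_accuracy))
    sys.exit(21)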
def train_imagenet(): print('==> Preparing data..') img_dim = get_model_property('img_dim') if FLAGS.fake_data: train_dataset_len = 1200000 # Roughly the size of Imagenet dataset. train_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 3, img_dim, img_dim), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=train_dataset_len // FLAGS.batch_size // xm.xrt_world_size()) test_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.test_set_batch_size, 3, img_dim, img_dim), torch.zeros(FLAGS.test_set_batch_size, dtype=torch.int64)), sample_count=50000 // FLAGS.batch_size // xm.xrt_world_size()) else: normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = torchvision.datasets.ImageFolder( os.path.join(FLAGS.datadir, 'train'), transforms.Compose([ transforms.RandomResizedCrop(img_dim), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])) train_dataset_len = len(train_dataset.imgs) resize_dim = max(img_dim, 256) test_dataset = torchvision.datasets.ImageFolder( os.path.join(FLAGS.datadir, 'val'), # Matches Torchvision's eval transforms except Torchvision uses size # 256 resize for all models both here and in the train loader. Their # version crashes during training on 299x299 images, e.g. inception. transforms.Compose([ transforms.Resize(resize_dim), transforms.CenterCrop(img_dim), transforms.ToTensor(), normalize, ])) train_sampler, test_sampler = None, None if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=False) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=FLAGS.batch_size, sampler=train_sampler, drop_last=FLAGS.drop_last, shuffle=False if train_sampler else True, persistent_workers=True, num_workers=FLAGS.num_workers) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=FLAGS.test_set_batch_size, sampler=test_sampler, drop_last=FLAGS.drop_last, shuffle=False, persistent_workers=True, num_workers=FLAGS.num_workers) torch.manual_seed(42) device = xm.xla_device() model = get_model_property('model_fn')() # Wrap the model with FSDP # You may wrap all, a subset, or none of the sub-modules with inner FSDPs # - to implement ZeRO-2, wrap none of the sub-modules # - to implement ZeRO-3, wrap all of the sub-modules (nested FSDP) # - you may wrap sub-modules at different granularity (e.g. at each resnet # stage or each residual block or each conv layer). fsdp_wrap = lambda m: FSDP(m.to(device), compute_dtype=getattr(torch, FLAGS.compute_dtype ), fp32_reduce_scatter=FLAGS.fp32_reduce_scatter, flatten_parameters=FLAGS.flatten_parameters) # Apply gradient checkpointing to sub-modules if specified grad_ckpt_wrap = checkpoint_module if FLAGS.use_gradient_checkpointing else ( lambda x: x) if FLAGS.use_nested_fsdp: # Here we apply inner FSDP at the level of child modules for ZeRO-3, which # corresponds to different stages in resnet (i.e. Stage 1 to 5). for submodule_name, submodule in model.named_children(): if sum(p.numel() for p in submodule.parameters()) == 0: # Skip those submodules without parameters (i.e. 
no need to shard them) continue # Note: wrap with `checkpoint_module` first BEFORE wrapping with FSDP m_fsdp = fsdp_wrap(grad_ckpt_wrap(getattr(model, submodule_name))) setattr(model, submodule_name, m_fsdp) # Always wrap the base model with an outer FSDP model = fsdp_wrap(model) writer = None if xm.is_master_ordinal(): writer = test_utils.get_summary_writer(FLAGS.logdir) optimizer = optim.SGD(model.parameters(), lr=FLAGS.lr, momentum=FLAGS.momentum, weight_decay=1e-4) num_training_steps_per_epoch = train_dataset_len // (FLAGS.batch_size * xm.xrt_world_size()) lr_scheduler = schedulers.WarmupAndExponentialDecayScheduler( optimizer, num_steps_per_epoch=num_training_steps_per_epoch, divide_every_n_epochs=FLAGS.lr_scheduler_divide_every_n_epochs, divisor=FLAGS.lr_scheduler_divisor, num_warmup_epochs=FLAGS.num_warmup_epochs, summary_writer=writer) loss_fn = nn.CrossEntropyLoss() def train_loop_fn(loader, epoch): tracker = xm.RateTracker() model.train() for step, (data, target) in enumerate(loader): optimizer.zero_grad() output = model(data) loss = loss_fn(output, target) loss.backward() optimizer.step() # do not reduce gradients on sharded params tracker.add(FLAGS.batch_size) if lr_scheduler: lr_scheduler.step() if step % FLAGS.log_steps == 0: xm.add_step_closure(_train_update, args=(device, step, loss, tracker, epoch, writer)) def test_loop_fn(loader, epoch): total_samples, correct = 0, 0 model.eval() for step, (data, target) in enumerate(loader): output = model(data) pred = output.max(1, keepdim=True)[1] correct += pred.eq(target.view_as(pred)).sum() total_samples += data.size()[0] if step % FLAGS.log_steps == 0: xm.add_step_closure(test_utils.print_test_update, args=(device, None, epoch, step)) accuracy = 100.0 * correct.item() / total_samples accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean) return accuracy train_device_loader = pl.MpDeviceLoader(train_loader, device) test_device_loader = pl.MpDeviceLoader(test_loader, device) accuracy, max_accuracy = 0.0, 0.0 for epoch in range(1, FLAGS.num_epochs + 1): xm.master_print('Epoch {} train begin {}'.format( epoch, test_utils.now())) train_loop_fn(train_device_loader, epoch) xm.master_print('Epoch {} train end {}'.format(epoch, test_utils.now())) run_eval = ((not FLAGS.test_only_at_end and epoch % FLAGS.eval_interval == 0) or epoch == FLAGS.num_epochs) if run_eval: accuracy = test_loop_fn(test_device_loader, epoch) xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format( epoch, test_utils.now(), accuracy)) max_accuracy = max(accuracy, max_accuracy) test_utils.write_to_summary( writer, epoch, dict_to_write={'Accuracy/test': accuracy}, write_xla_metrics=True) if FLAGS.metrics_debug: xm.master_print(met.metrics_report()) test_utils.close_summary_writer(writer) xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy)) return max_accuracy
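# As an alternative to the manual child-module loop above, torch_xla's FSDP can wrap
# submodules automatically with an auto-wrap policy. A sketch assuming the
# torch_xla.distributed.fsdp API (argument names may differ across versions):
import functools
from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
from torch_xla.distributed.fsdp.wrap import size_based_auto_wrap_policy

auto_wrap_policy = functools.partial(
    size_based_auto_wrap_policy, min_num_params=int(1e6))
model = FSDP(model.to(device), auto_wrap_policy=auto_wrap_policy)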
def train_imagenet(): print('==> Preparing data..') img_dim = get_model_property('img_dim') if FLAGS.fake_data: train_dataset_len = 1200000 # Roughly the size of Imagenet dataset. train_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 3, img_dim, img_dim), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=train_dataset_len // FLAGS.batch_size // xm.xrt_world_size()) test_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.test_set_batch_size, 3, img_dim, img_dim), torch.zeros(FLAGS.test_set_batch_size, dtype=torch.int64)), sample_count=50000 // FLAGS.batch_size // xm.xrt_world_size()) else: normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = torchvision.datasets.ImageFolder( os.path.join(FLAGS.datadir, 'train'), transforms.Compose([ transforms.RandomResizedCrop(img_dim), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ])) train_dataset_len = len(train_dataset.imgs) resize_dim = max(img_dim, 256) test_dataset = torchvision.datasets.ImageFolder( os.path.join(FLAGS.datadir, 'val'), # Matches Torchvision's eval transforms except Torchvision uses size # 256 resize for all models both here and in the train loader. Their # version crashes during training on 299x299 images, e.g. inception. transforms.Compose([ transforms.Resize(resize_dim), transforms.CenterCrop(img_dim), transforms.ToTensor(), normalize, ])) train_sampler = None test_sampler = None if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=False) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=FLAGS.batch_size, sampler=train_sampler, drop_last=FLAGS.drop_last, shuffle=False if train_sampler else True, num_workers=FLAGS.num_workers) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=FLAGS.test_set_batch_size, sampler=test_sampler, drop_last=FLAGS.drop_last, shuffle=False, num_workers=FLAGS.num_workers) torch.manual_seed(42) devices = (xm.get_xla_supported_devices( max_devices=FLAGS.num_cores) if FLAGS.num_cores != 0 else []) # Pass [] as device_ids to run using the PyTorch/CPU engine. 
torchvision_model = get_model_property('model_fn') model_parallel = dp.DataParallel(torchvision_model, device_ids=devices) def train_loop_fn(model, loader, device, context): loss_fn = nn.CrossEntropyLoss() optimizer = context.getattr_or( 'optimizer', lambda: optim.SGD(model.parameters(), lr=FLAGS.lr, momentum=FLAGS.momentum, weight_decay=1e-4)) lr_scheduler = context.getattr_or( 'lr_scheduler', lambda: schedulers.wrap_optimizer_with_scheduler( optimizer, scheduler_type=getattr(FLAGS, 'lr_scheduler_type', None), scheduler_divisor=getattr(FLAGS, 'lr_scheduler_divisor', None), scheduler_divide_every_n_epochs=getattr( FLAGS, 'lr_scheduler_divide_every_n_epochs', None), num_steps_per_epoch=num_training_steps_per_epoch, summary_writer=writer if xm.is_master_ordinal() else None)) tracker = xm.RateTracker() model.train() for x, (data, target) in enumerate(loader): optimizer.zero_grad() output = model(data) loss = loss_fn(output, target) loss.backward() xm.optimizer_step(optimizer) tracker.add(FLAGS.batch_size) if x % FLAGS.log_steps == 0: test_utils.print_training_update(device, x, loss.item(), tracker.rate(), tracker.global_rate()) if lr_scheduler: lr_scheduler.step() def test_loop_fn(model, loader, device, context): total_samples = 0 correct = 0 model.eval() for data, target in loader: output = model(data) pred = output.max(1, keepdim=True)[1] correct += pred.eq(target.view_as(pred)).sum().item() total_samples += data.size()[0] accuracy = 100.0 * correct / total_samples test_utils.print_test_update(device, accuracy) return accuracy accuracy = 0.0 writer = test_utils.get_summary_writer(FLAGS.logdir) num_devices = len( xm.xla_replication_devices(devices)) if len(devices) > 1 else 1 num_training_steps_per_epoch = train_dataset_len // (FLAGS.batch_size * num_devices) for epoch in range(1, FLAGS.num_epochs + 1): model_parallel(train_loop_fn, train_loader) accuracies = model_parallel(test_loop_fn, test_loader) accuracy = mean(accuracies) print('Epoch: {}, Mean Accuracy: {:.2f}%'.format(epoch, accuracy)) global_step = (epoch - 1) * num_training_steps_per_epoch test_utils.add_scalar_to_summary(writer, 'Accuracy/test', accuracy, global_step) if FLAGS.metrics_debug: print(met.metrics_report()) test_utils.close_summary_writer(writer) return accuracy
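# `get_model_property` is used throughout the ImageNet variants but not defined in
# this section. A sketch based on the usual per-model property table (the
# inception_v3 entry and the 224 default are assumptions):
def get_model_property(key):
  default_model_property = {
      'img_dim': 224,
      'model_fn': getattr(torchvision.models, FLAGS.model),
  }
  model_properties = {
      'inception_v3': {
          'img_dim': 299,
          'model_fn': lambda: torchvision.models.inception_v3(aux_logits=False),
      },
  }
  return model_properties.get(FLAGS.model, default_model_property)[key]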
def train_mnist(): torch.manual_seed(1) if FLAGS.fake_data: train_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 1, 28, 28), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=60000 // FLAGS.batch_size // xm.xrt_world_size()) test_loader = xu.SampleGenerator( data=(torch.zeros(FLAGS.batch_size, 1, 28, 28), torch.zeros(FLAGS.batch_size, dtype=torch.int64)), sample_count=10000 // FLAGS.batch_size // xm.xrt_world_size()) else: train_dataset = datasets.MNIST(os.path.join(FLAGS.datadir, str(xm.get_ordinal())), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])) test_dataset = datasets.MNIST(os.path.join(FLAGS.datadir, str(xm.get_ordinal())), train=False, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, )) ])) train_sampler = None if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=FLAGS.batch_size, sampler=train_sampler, drop_last=FLAGS.drop_last, shuffle=False if train_sampler else True, num_workers=FLAGS.num_workers) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=FLAGS.batch_size, drop_last=FLAGS.drop_last, shuffle=False, num_workers=FLAGS.num_workers) # Scale learning rate to num cores lr = FLAGS.lr * xm.xrt_world_size() device = xm.xla_device() model = MNIST().to(device) writer = None if xm.is_master_ordinal(): writer = test_utils.get_summary_writer(FLAGS.logdir) optimizer = optim.SGD(model.parameters(), lr=lr, momentum=FLAGS.momentum) loss_fn = nn.NLLLoss() def train_loop_fn(loader): tracker = xm.RateTracker() model.train() for x, (data, target) in enumerate(loader): optimizer.zero_grad() output = model(data) loss = loss_fn(output, target) loss.backward() xm.optimizer_step(optimizer) tracker.add(FLAGS.batch_size) if x % FLAGS.log_steps == 0: xm.add_step_closure(_train_update, args=(device, x, loss, tracker)) def test_loop_fn(loader): total_samples = 0 correct = 0 model.eval() for data, target in loader: output = model(data) pred = output.max(1, keepdim=True)[1] correct += pred.eq(target.view_as(pred)).sum().item() total_samples += data.size()[0] accuracy = 100.0 * correct / total_samples test_utils.print_test_update(device, accuracy) return accuracy accuracy = 0.0 for epoch in range(1, FLAGS.num_epochs + 1): para_loader = pl.ParallelLoader(train_loader, [device]) train_loop_fn(para_loader.per_device_loader(device)) xm.master_print('Finished training epoch {}'.format(epoch)) para_loader = pl.ParallelLoader(test_loader, [device]) accuracy = test_loop_fn(para_loader.per_device_loader(device)) test_utils.add_scalar_to_summary(writer, 'Accuracy/test', accuracy, epoch) if FLAGS.metrics_debug: print(met.metrics_report()) test_utils.close_summary_writer(writer) return accuracy
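# The MNIST network instantiated by these loops; it outputs log-probabilities,
# matching the NLLLoss used above. Layer sizes follow the canonical PyTorch/XLA
# MNIST example and are assumptions here:
import torch.nn.functional as F

class MNIST(nn.Module):
  def __init__(self):
    super().__init__()
    self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
    self.bn1 = nn.BatchNorm2d(10)
    self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
    self.bn2 = nn.BatchNorm2d(20)
    self.fc1 = nn.Linear(320, 50)
    self.fc2 = nn.Linear(50, 10)

  def forward(self, x):
    x = F.relu(F.max_pool2d(self.conv1(x), 2))
    x = self.bn1(x)
    x = F.relu(F.max_pool2d(self.conv2(x), 2))
    x = self.bn2(x)
    x = torch.flatten(x, 1)
    x = F.relu(self.fc1(x))
    x = self.fc2(x)
    return F.log_softmax(x, dim=1)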
def train_imagenet(): print("==> Preparing data..") img_dim = get_model_property("img_dim") if FLAGS.fake_data: train_dataset_len = 1200000 # Roughly the size of Imagenet dataset. train_loader = xu.SampleGenerator( data=( torch.zeros(FLAGS.batch_size, 3, img_dim, img_dim), torch.zeros(FLAGS.batch_size, dtype=torch.int64), ), sample_count=train_dataset_len // FLAGS.batch_size // xm.xrt_world_size(), ) if FLAGS.validate: test_loader = xu.SampleGenerator( data=( torch.zeros(FLAGS.test_set_batch_size, 3, img_dim, img_dim), torch.zeros(FLAGS.test_set_batch_size, dtype=torch.int64), ), sample_count=50000 // FLAGS.batch_size // xm.xrt_world_size(), ) else: normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_dataset = torchvision.datasets.ImageFolder( os.path.join(FLAGS.datadir, "train"), transforms.Compose([ transforms.RandomResizedCrop(img_dim), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ]), ) train_dataset_len = len(train_dataset.imgs) resize_dim = max(img_dim, 256) if FLAGS.validate: test_dataset = torchvision.datasets.ImageFolder( os.path.join(FLAGS.datadir, "val"), # Matches Torchvision's eval transforms except Torchvision uses size # 256 resize for all models both here and in the train loader. Their # version crashes during training on 299x299 images, e.g. inception. transforms.Compose([ transforms.Resize(resize_dim), transforms.CenterCrop(img_dim), transforms.ToTensor(), normalize, ]), ) train_sampler, test_sampler = None, None if xm.xrt_world_size() > 1: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=True) if FLAGS.validate: test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal(), shuffle=False) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=FLAGS.batch_size, sampler=train_sampler, drop_last=FLAGS.drop_last, shuffle=False if train_sampler else True, num_workers=FLAGS.num_workers, ) if FLAGS.validate: test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=FLAGS.test_set_batch_size, sampler=test_sampler, drop_last=FLAGS.drop_last, shuffle=False, num_workers=FLAGS.num_workers, ) device = xm.xla_device() model = get_model_property("model_fn")().to(device) writer = None if xm.is_master_ordinal(): writer = test_utils.get_summary_writer(FLAGS.logdir) optimizer = optim.SGD(model.parameters(), lr=FLAGS.lr, momentum=FLAGS.momentum, weight_decay=1e-4) num_training_steps_per_epoch = train_dataset_len // (FLAGS.batch_size * xm.xrt_world_size()) lr_scheduler = schedulers.wrap_optimizer_with_scheduler( optimizer, scheduler_type=getattr(FLAGS, "lr_scheduler_type", None), scheduler_divisor=getattr(FLAGS, "lr_scheduler_divisor", None), scheduler_divide_every_n_epochs=getattr( FLAGS, "lr_scheduler_divide_every_n_epochs", None), num_steps_per_epoch=num_training_steps_per_epoch, summary_writer=writer, ) loss_fn = nn.CrossEntropyLoss() scaler = GradScaler() def train_loop_fn(loader, epoch): if FLAGS.fine_grained_metrics: epoch_start_time = time.time() step_latency_tracker, bwd_latency_tracker, fwd_latency_tracker = [], [], [] else: tracker = xm.RateTracker() model.train() for step, (data, target) in enumerate(loader): if FLAGS.fine_grained_metrics: step_start_time = time.time() optimizer.zero_grad() if FLAGS.fine_grained_metrics: fwd_start_time = time.time() with autocast(): output = model(data) loss = loss_fn(output, target) if 
FLAGS.fine_grained_metrics:
        fwd_end_time = time.time()
        fwd_latency = fwd_end_time - fwd_start_time
        bwd_start_time = time.time()
      scaler.scale(loss).backward()
      gradients = xm._fetch_gradients(optimizer)
      xm.all_reduce("sum", gradients, scale=1.0 / xm.xrt_world_size())
      scaler.step(optimizer)
      scaler.update()
      xm.mark_step()
      if lr_scheduler:
        lr_scheduler.step()
      if FLAGS.fine_grained_metrics:
        bwd_end_time = time.time()
        bwd_latency = bwd_end_time - bwd_start_time
        step_latency = bwd_end_time - step_start_time
        step_latency_tracker.append(step_latency)
        bwd_latency_tracker.append(bwd_latency)
        fwd_latency_tracker.append(fwd_latency)
      else:
        tracker.add(FLAGS.batch_size)
      if step % FLAGS.log_steps == 0:
        if FLAGS.fine_grained_metrics:
          print(
              'FineGrainedMetrics :: Epoch={} Step={} Rate(DataPoints/s)[p50]={:.1f} '
              'BatchSize={} Step(s/Batch)[p50]={:.2f} Fwd(s/Batch)[p50]={:.4f} '
              'Bwd(s/Batch)[p50]={:.4f}'.format(
                  epoch, step, FLAGS.batch_size / p50(step_latency_tracker),
                  FLAGS.batch_size, p50(step_latency_tracker),
                  p50(bwd_latency_tracker), p50(fwd_latency_tracker)))
        else:
          # _train_update(device, step, loss, tracker, epoch, writer)
          xm.add_step_closure(
              _train_update, args=(device, step, loss, tracker, epoch, writer))
    if FLAGS.fine_grained_metrics:
      epoch_end_time = time.time()
      epoch_latency = epoch_end_time - epoch_start_time
      print(
          'FineGrainedMetrics :: Epoch={} Epoch(s)={:.2f} Rate(DataPoints/s)[p50]={:.1f} '
          'BatchSize={} Step(s/Batch)[p50]={:.2f} Fwd(s/Batch)[p50]={:.4f} '
          'Bwd(s/Batch)[p50]={:.4f}'.format(
              epoch, epoch_latency, FLAGS.batch_size / p50(step_latency_tracker),
              FLAGS.batch_size, p50(step_latency_tracker),
              p50(bwd_latency_tracker), p50(fwd_latency_tracker)))

  def test_loop_fn(loader, epoch):
    total_samples, correct = 0, 0
    model.eval()
    for step, (data, target) in enumerate(loader):
      output = model(data)
      pred = output.max(1, keepdim=True)[1]
      correct += pred.eq(target.view_as(pred)).sum()
      total_samples += data.size()[0]
      if step % FLAGS.log_steps == 0:
        test_utils.print_test_update(device, None, epoch, step)
        # xm.add_step_closure(test_utils.print_test_update, args=(device, None, epoch, step))
    accuracy = 100.0 * correct.item() / total_samples
    accuracy = xm.mesh_reduce("test_accuracy", accuracy, np.mean)
    return accuracy

  train_device_loader = pl.MpDeviceLoader(train_loader, device)
  if FLAGS.validate:
    test_device_loader = pl.MpDeviceLoader(test_loader, device)
  accuracy, max_accuracy = 0.0, 0.0
  for epoch in range(1, FLAGS.num_epochs + 1):
    xm.master_print("Epoch {} train begin {}".format(epoch, test_utils.now()))
    train_loop_fn(train_device_loader, epoch)
    xm.master_print("Epoch {} train end {}".format(epoch, test_utils.now()))
    if FLAGS.validate:
      accuracy = test_loop_fn(test_device_loader, epoch)
      xm.master_print("Epoch {} test end {}, Accuracy={:.2f}".format(
          epoch, test_utils.now(), accuracy))
      max_accuracy = max(accuracy, max_accuracy)
      test_utils.write_to_summary(
          writer,
          epoch,
          dict_to_write={"Accuracy/test": accuracy},
          write_xla_metrics=True)
    if FLAGS.metrics_debug:
      xm.master_print(met.metrics_report())
  test_utils.close_summary_writer(writer)
  if FLAGS.validate:
    xm.master_print("Max Accuracy: {:.2f}%".format(max_accuracy))
  return max_accuracy if FLAGS.validate else None
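# The fine-grained metrics above log medians through a `p50` helper that is not
# shown here; presumably something along these lines:
def p50(values):
  return np.percentile(values, 50) if values else float('nan')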
def train_mnist(flags, training_started=None, dynamic_graph=False, fetch_often=False):
    torch.manual_seed(1)
    if flags.fake_data:
        train_loader = xu.SampleGenerator(
            data=(
                torch.zeros(flags.batch_size, 1, 28, 28),
                torch.zeros(flags.batch_size, dtype=torch.int64),
            ),
            sample_count=600000 // flags.batch_size // xm.xrt_world_size(),
        )
        test_loader = xu.SampleGenerator(
            data=(
                torch.zeros(flags.batch_size, 1, 28, 28),
                torch.zeros(flags.batch_size, dtype=torch.int64),
            ),
            sample_count=100000 // flags.batch_size // xm.xrt_world_size(),
        )
    else:
        train_dataset = datasets.MNIST(
            os.path.join(flags.datadir, str(xm.get_ordinal())),
            train=True,
            download=True,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        )
        test_dataset = datasets.MNIST(
            os.path.join(flags.datadir, str(xm.get_ordinal())),
            train=False,
            download=True,
            transform=transforms.Compose(
                [transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))]
            ),
        )
        train_sampler = None
        if xm.xrt_world_size() > 1:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal(),
                shuffle=True,
            )
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=flags.batch_size,
            sampler=train_sampler,
            drop_last=flags.drop_last,
            shuffle=False if train_sampler else True,
            num_workers=flags.num_workers,
        )
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=flags.batch_size,
            drop_last=flags.drop_last,
            shuffle=False,
            num_workers=flags.num_workers,
        )

    # Scale learning rate to num cores
    lr = flags.lr * xm.xrt_world_size()

    device = xm.xla_device()
    model = MNIST().to(device)
    writer = None
    if xm.is_master_ordinal():
        writer = test_utils.get_summary_writer(flags.logdir)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=flags.momentum)
    loss_fn = nn.NLLLoss()

    server = xp.start_server(flags.profiler_port)

    def train_loop_fn(loader):
        tracker = xm.RateTracker()
        model.train()
        for step, (data, target) in enumerate(loader):
            if dynamic_graph:
                # testing purpose only: dynamic batch size and graph.
                index = max(-step, -flags.batch_size + 1)  # non-empty
                data, target = data[:-index, :, :, :], target[:-index]
            if step >= 15 and training_started:
                # testing purpose only: set event for synchronization.
                training_started.set()
            with xp.StepTrace("train_mnist", step_num=step):
                with xp.Trace("build_graph"):
                    optimizer.zero_grad()
                    output = model(data)
                    loss = loss_fn(output, target)
                    loss.backward()
                xm.optimizer_step(optimizer)
                if fetch_often:
                    # testing purpose only: fetch XLA tensors to CPU.
                    loss_i = loss.item()
                tracker.add(flags.batch_size)
                if step % flags.log_steps == 0:
                    xm.add_step_closure(
                        _train_update, args=(device, step, loss, tracker, writer)
                    )

    def test_loop_fn(loader):
        total_samples = 0
        correct = 0
        model.eval()
        for data, target in loader:
            with xp.StepTrace("test_mnist"):
                output = model(data)
                pred = output.max(1, keepdim=True)[1]
                correct += pred.eq(target.view_as(pred)).sum()
                total_samples += data.size()[0]
        accuracy = 100.0 * correct.item() / total_samples
        accuracy = xm.mesh_reduce("test_accuracy", accuracy, np.mean)
        return accuracy

    train_device_loader = pl.MpDeviceLoader(train_loader, device)
    test_device_loader = pl.MpDeviceLoader(test_loader, device)
    accuracy, max_accuracy = 0.0, 0.0
    for epoch in range(1, flags.num_epochs + 1):
        xm.master_print("Epoch {} train begin {}".format(epoch, test_utils.now()))
        train_loop_fn(train_device_loader)
        xm.master_print("Epoch {} train end {}".format(epoch, test_utils.now()))
        accuracy = test_loop_fn(test_device_loader)
        xm.master_print(
            "Epoch {} test end {}, Accuracy={:.2f}".format(epoch, test_utils.now(), accuracy)
        )
        max_accuracy = max(accuracy, max_accuracy)
        test_utils.write_to_summary(
            writer, epoch, dict_to_write={"Accuracy/test": accuracy}, write_xla_metrics=True
        )
        if flags.metrics_debug:
            xm.master_print(met.metrics_report())

    test_utils.close_summary_writer(writer)
    xm.master_print("Max Accuracy: {:.2f}%".format(max_accuracy))
    return max_accuracy
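# The training entry points in this file are written to run one process per XLA
# device. A minimal launcher sketch using xmp.spawn; the `_mp_fn` name and the
# `num_cores` flag are illustrative assumptions, not part of this excerpt, and
# the import may already exist at the top of the script:
import torch_xla.distributed.xla_multiprocessing as xmp

def _mp_fn(index, flags):
    # Each spawned process picks up its own XLA device inside train_mnist().
    train_mnist(flags)

# Example invocation from the script's __main__ block (assumed flags):
# xmp.spawn(_mp_fn, args=(FLAGS,), nprocs=FLAGS.num_cores)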
def train_mnist(flags, **kwargs):
    torch.manual_seed(1)
    if flags.fake_data:
        train_loader = xu.SampleGenerator(
            data=(torch.zeros(flags.batch_size, 1, 28, 28),
                  torch.zeros(flags.batch_size, dtype=torch.int64)),
            sample_count=60000 // flags.batch_size // xm.xrt_world_size())
        test_loader = xu.SampleGenerator(
            data=(torch.zeros(flags.batch_size, 1, 28, 28),
                  torch.zeros(flags.batch_size, dtype=torch.int64)),
            sample_count=10000 // flags.batch_size // xm.xrt_world_size())
    else:
        train_dataset = datasets.MNIST(
            os.path.join(flags.datadir, str(xm.get_ordinal())),
            train=True,
            download=True,
            transform=transforms.Compose(
                [transforms.ToTensor(),
                 transforms.Normalize((0.1307,), (0.3081,))]))
        test_dataset = datasets.MNIST(
            os.path.join(flags.datadir, str(xm.get_ordinal())),
            train=False,
            download=True,
            transform=transforms.Compose(
                [transforms.ToTensor(),
                 transforms.Normalize((0.1307,), (0.3081,))]))
        train_sampler = None
        if xm.xrt_world_size() > 1:
            train_sampler = torch.utils.data.distributed.DistributedSampler(
                train_dataset,
                num_replicas=xm.xrt_world_size(),
                rank=xm.get_ordinal(),
                shuffle=True)
        train_loader = torch.utils.data.DataLoader(
            train_dataset,
            batch_size=flags.batch_size,
            sampler=train_sampler,
            drop_last=flags.drop_last,
            shuffle=False if train_sampler else True,
            num_workers=flags.num_workers)
        test_loader = torch.utils.data.DataLoader(
            test_dataset,
            batch_size=flags.batch_size,
            drop_last=flags.drop_last,
            shuffle=False,
            num_workers=flags.num_workers)

    # Scale learning rate to num cores
    lr = flags.lr * xm.xrt_world_size()

    device = xm.xla_device()
    model = MNIST()
    # Wrap the model with FSDP
    fsdp_wrap = lambda m: FSDP(
        m.to(device),
        compute_dtype=getattr(torch, flags.compute_dtype),
        fp32_reduce_scatter=flags.fp32_reduce_scatter,
        flatten_parameters=flags.flatten_parameters)
    # Apply gradient checkpointing to sub-modules if specified
    grad_ckpt_wrap = checkpoint_module if flags.use_gradient_checkpointing else (
        lambda x: x)
    if flags.use_nested_fsdp:
        # Wrap a few sub-modules with inner FSDP (to implement ZeRO-3)
        # Note: wrap with `checkpoint_module` first BEFORE wrapping with FSDP
        model.conv1 = fsdp_wrap(grad_ckpt_wrap(model.conv1))
        model.conv2 = fsdp_wrap(grad_ckpt_wrap(model.conv2))
        model.fc1 = fsdp_wrap(grad_ckpt_wrap(model.fc1))
        model.fc2 = fsdp_wrap(grad_ckpt_wrap(model.fc2))
    # Always wrap the base model with an outer FSDP
    model = fsdp_wrap(model)

    writer = None
    if xm.is_master_ordinal():
        writer = test_utils.get_summary_writer(flags.logdir)
    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=flags.momentum)
    loss_fn = nn.NLLLoss()

    def train_loop_fn(model, loader):
        tracker = xm.RateTracker()
        model.train()
        for step, (data, target) in enumerate(loader):
            optimizer.zero_grad()
            output = model(data)
            loss = loss_fn(output, target)
            loss.backward()
            optimizer.step()  # do not reduce gradients on sharded params
            tracker.add(flags.batch_size)
            if step % flags.log_steps == 0:
                xm.add_step_closure(
                    _train_update,
                    args=(device, step, loss, tracker, writer),
                    run_async=flags.async_closures)

    def test_loop_fn(model, loader):
        total_samples = 0
        correct = 0
        model.eval()
        for data, target in loader:
            output = model(data)
            pred = output.max(1, keepdim=True)[1]
            correct += pred.eq(target.view_as(pred)).sum()
            total_samples += data.size()[0]
        accuracy = 100.0 * correct.item() / total_samples
        accuracy = xm.mesh_reduce('test_accuracy', accuracy, np.mean)
        return accuracy

    train_device_loader = pl.MpDeviceLoader(train_loader, device)
    test_device_loader = pl.MpDeviceLoader(test_loader, device)
    accuracy, max_accuracy = 0.0, 0.0
    for epoch in range(1, flags.num_epochs + 1):
        xm.master_print('Epoch {} train begin {}'.format(epoch, test_utils.now()))
        train_loop_fn(model, train_device_loader)
        xm.master_print('Epoch {} train end {}'.format(epoch, test_utils.now()))
        accuracy = test_loop_fn(model, test_device_loader)
        xm.master_print('Epoch {} test end {}, Accuracy={:.2f}'.format(
            epoch, test_utils.now(), accuracy))
        max_accuracy = max(accuracy, max_accuracy)
        test_utils.write_to_summary(
            writer,
            epoch,
            dict_to_write={'Accuracy/test': accuracy},
            write_xla_metrics=True)
        if flags.metrics_debug:
            xm.master_print(met.metrics_report())

    if flags.ckpt_consolidation:
        # Note: to run this test, all the model checkpoints need to be
        # accessible from the master rank. Set --ckpt_prefix to a shared file
        # system (e.g. NFS) when running on a TPU pod.

        # Save the final model checkpoint
        rank = xm.get_ordinal()
        world_size = xm.xrt_world_size()
        ckpt_path = f'{flags.ckpt_prefix}_rank-{rank:08d}-of-{world_size:08d}.pth'
        ckpt = {
            'model': model.state_dict(),
            'shard_metadata': model.get_shard_metadata(),
            'optimizer': optimizer.state_dict(),  # not needed in ckpt consolidation
        }
        os.makedirs(os.path.dirname(ckpt_path), exist_ok=True)
        xm.save(ckpt, ckpt_path, master_only=False)
        print(f'checkpoint saved to {ckpt_path}\n', end='')

        # Consolidate the sharded model checkpoints and test its accuracy
        if xm.is_master_ordinal(local=False):
            consolidate_sharded_model_checkpoints(
                ckpt_prefix=flags.ckpt_prefix, ckpt_suffix="_rank-*-of-*.pth")
        xm.rendezvous('ckpt_consolidation')
        model = MNIST().to(device)
        ckpt_consolidated = torch.load(f'{flags.ckpt_prefix}_consolidated.pth')
        model.load_state_dict(ckpt_consolidated['model'])
        accuracy = test_loop_fn(model, test_device_loader)
        xm.master_print(
            f'Checkpoint consolidated, Accuracy={accuracy:.2f} '
            '(note: it can be slightly different from the final training accuracy '
            'due to non-sync BatchNorm2d in the model)')

    test_utils.close_summary_writer(writer)
    xm.master_print('Max Accuracy: {:.2f}%'.format(max_accuracy))
    return max_accuracy
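# Once `consolidate_sharded_model_checkpoints` has written
# '<ckpt_prefix>_consolidated.pth' (as in the ckpt_consolidation branch above),
# the full unsharded weights can be reloaded into a plain MNIST module, e.g. for
# CPU-side evaluation. A minimal sketch under that assumption; the 'model' key
# mirrors the per-rank checkpoints saved above, and `load_consolidated_mnist`
# is an illustrative helper, not part of the original script:
def load_consolidated_mnist(ckpt_prefix):
    model = MNIST()
    ckpt = torch.load(f'{ckpt_prefix}_consolidated.pth', map_location='cpu')
    model.load_state_dict(ckpt['model'])
    model.eval()
    return model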