def setup_training(args):
    """Prepare device, distributed backend, logging and output directory.

    Args:
        args: Parsed run arguments. Several fields are updated in place
            (``n_gpu``, ``allreduce_post_accumulation``,
            ``allreduce_post_accumulation_fp16``, ``train_batch_size``).

    Returns:
        A ``(device, args)`` tuple.

    Raises:
        ValueError: If the gradient accumulation settings are invalid,
            ``do_train`` is not set, or the output directory already holds
            ``ckpt*`` entries while not resuming from a checkpoint.
    """
    assert torch.cuda.is_available()

    distributed = args.local_rank != -1
    if distributed:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        # Initializes the distributed backend which takes care of
        # synchronizing nodes/GPUs.
        torch.distributed.init_process_group(backend='smddp',
                                             init_method='env://')
        args.n_gpu = 1
    else:
        device = torch.device("cuda")
        args.n_gpu = torch.cuda.device_count()
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False

    # Without accumulation there is nothing to defer the allreduce for.
    if args.gradient_accumulation_steps == 1:
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False

    # Only the main process gets real logging backends.
    log_backends = []
    if is_main_process():
        log_backends = [
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step),
        ]
    dllogger.init(backends=log_backends)

    print(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, args.n_gpu, bool(distributed), args.fp16))

    accumulation_steps = args.gradient_accumulation_steps
    if accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(accumulation_steps))
    if args.train_batch_size % accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(accumulation_steps, args.train_batch_size))

    # From here on train_batch_size is the per-accumulation-step batch size.
    args.train_batch_size = args.train_batch_size // accumulation_steps

    if not args.do_train:
        raise ValueError(" `do_train` must be True.")

    # Refuse to start a fresh run on top of existing checkpoints.
    if not args.resume_from_checkpoint and os.path.exists(args.output_dir):
        if any(entry.startswith('ckpt')
               for entry in os.listdir(args.output_dir)):
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".format(
                    args.output_dir))

    reuse_existing_dir = (args.resume_from_checkpoint
                          and os.path.exists(args.output_dir))
    if not reuse_existing_dir and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
def setup_training(args):
    """Prepare the ONNX Runtime device, logging and output directory.

    The device comes from ``ort_supplement.setup_onnxruntime_with_mpi``;
    ``args.n_gpu`` is only read here, so it is presumably populated by that
    helper (TODO confirm).

    Args:
        args: Parsed run arguments; ``train_batch_size`` is divided in place
            by ``gradient_accumulation_steps``.

    Returns:
        A ``(device, args)`` tuple.

    Raises:
        ValueError: If the gradient accumulation settings are invalid,
            ``do_train`` is not set, or the output directory already holds
            ``ckpt*`` entries while not resuming from a checkpoint.
    """
    # The module is imported lazily and published as a module-level global
    # so other functions in this file can use it afterwards.
    global ort_supplement
    assert torch.cuda.is_available()

    import ort_supplement.ort_supplement as ort_supplement
    device = ort_supplement.setup_onnxruntime_with_mpi(args)

    # Only the main process gets real logging backends.
    log_backends = []
    if is_main_process():
        log_backends = [
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step),
        ]
    dllogger.init(backends=log_backends)

    print(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, args.n_gpu, bool(args.local_rank != -1), args.fp16))

    accumulation_steps = args.gradient_accumulation_steps
    if accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(accumulation_steps))
    if args.train_batch_size % accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(accumulation_steps, args.train_batch_size))

    # From here on train_batch_size is the per-accumulation-step batch size.
    args.train_batch_size = args.train_batch_size // accumulation_steps

    if not args.do_train:
        raise ValueError(" `do_train` must be True.")

    # Refuse to start a fresh run on top of existing checkpoints.
    if not args.resume_from_checkpoint and os.path.exists(args.output_dir):
        if any(entry.startswith('ckpt')
               for entry in os.listdir(args.output_dir)):
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".format(
                    args.output_dir))

    reuse_existing_dir = (args.resume_from_checkpoint
                          and os.path.exists(args.output_dir))
    if not reuse_existing_dir and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
def setup_training(args):
    """Prepare the SMP (SageMaker model parallel) device and output directory.

    Args:
        args: Parsed run arguments. ``n_gpu``, ``train_batch_size`` and,
            when ``gradient_accumulation_steps == 1``, the
            ``allreduce_post_accumulation*`` flags are updated in place.

    Returns:
        A ``(device, args)`` tuple.

    Raises:
        ValueError: If the gradient accumulation settings are invalid or the
            output directory already holds ``ckpt*`` entries while not
            resuming from a checkpoint.
    """
    assert torch.cuda.is_available()

    if args.smp > 0:
        # Initialize SMP. The configuration is obtained from the parameters
        # passed to the Sagemaker PyTorch estimator.
        smp.init()

    # SMP: Set the device to the GPU ID used by the current process.
    # Input tensors should be transferred to this device.
    local_gpu = smp.local_rank()
    torch.cuda.set_device(local_gpu)
    device = torch.device("cuda", smp.local_rank())
    args.n_gpu = 1

    # Without accumulation there is nothing to defer the allreduce for.
    if args.gradient_accumulation_steps == 1:
        args.allreduce_post_accumulation = False
        args.allreduce_post_accumulation_fp16 = False

    print(
        "device: {} n_gpu: {}, mp_rank: {}, rank: {}, distributed training: {}, 16-bits training: {}"
        .format(device, args.n_gpu, smp.mp_rank(), smp.rank(),
                bool(args.local_rank != -1), args.fp16))

    accumulation_steps = args.gradient_accumulation_steps
    if accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(accumulation_steps))
    if args.train_batch_size % accumulation_steps != 0:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible"
            .format(accumulation_steps, args.train_batch_size))

    # From here on train_batch_size is the per-accumulation-step batch size.
    args.train_batch_size = args.train_batch_size // accumulation_steps

    # Refuse to start a fresh run on top of existing checkpoints.
    if not args.resume_from_checkpoint and os.path.exists(args.output_dir):
        if any(entry.startswith("ckpt")
               for entry in os.listdir(args.output_dir)):
            raise ValueError(
                "Output directory ({}) already exists and is not empty.".format(
                    args.output_dir))

    reuse_existing_dir = (args.resume_from_checkpoint
                          and os.path.exists(args.output_dir))
    if not reuse_existing_dir and is_main_process():
        os.makedirs(args.output_dir, exist_ok=True)

    return device, args
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq):
    """Train ``model`` for one epoch and log per-iteration losses.

    Args:
        model: Called as ``model(images, targets)``; must return a dict of
            loss tensors.
        optimizer: Optimizer stepped once per batch.
        data_loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the images and target tensors are moved to.
        epoch: Current epoch index; a warm-up LR scheduler is used only when
            it is 0.
        print_freq: Logging period handed to ``MetricLogger.log_every``.

    Returns:
        The ``utils.MetricLogger`` holding the epoch's statistics.
    """
    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    header = 'Epoch: [{}]'.format(epoch)

    # Linear warm-up is only applied during the very first epoch.
    warmup_scheduler = None
    if epoch == 0:
        warmup_factor = 1. / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        warmup_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters,
                                                     warmup_factor)

    batches = metric_logger.log_every(data_loader, print_freq, header)
    for images, targets in batches:
        images = [image.to(device) for image in images]
        targets = [{key: value.to(device)
                    for key, value in target.items()}
                   for target in targets]

        loss_dict = model(images, targets)

        # Tensorboard logging happens only in the main process.
        if utils.is_main_process():
            # One scalar per loss term, grouped under the "loss" tag and
            # indexed by the global iteration counter.
            tensorboard.logger.add_scalar_dict(
                loss_dict,
                global_step=tensorboard.global_iter,
                tag="loss")
            tensorboard.global_iter += 1

        losses = sum(loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        # Abort the whole run on NaN/Inf loss.
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if warmup_scheduler is not None:
            warmup_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    return metric_logger
def take_optimizer_step(args, optimizer, model, overflow_buf, global_step):
    """Apply one optimizer step, optionally with a manual gradient allreduce.

    When ``args.allreduce_post_accumulation`` is set, the master gradients
    are flattened, pre-divided, summed across ranks and unscaled by hand,
    and the optimizer step is skipped if the loss scaler reports an
    overflow. Otherwise the optimizer is stepped directly (only when
    ``args.apply_optimizer > 0``).

    Args:
        args: Run arguments (``allreduce_post_accumulation``,
            ``allreduce_post_accumulation_fp16``, ``fp16``,
            ``gradient_accumulation_steps``, ``apply_optimizer``).
        optimizer: AMP-wrapped optimizer.
        model: Model whose parameter gradients are cleared afterwards.
        overflow_buf: Tensor used as the overflow flag by the fused
            ``multi_tensor_scale`` kernels.
        global_step: Current global step.

    Returns:
        The updated global step; it is not advanced when the step was
        skipped because of an overflow.
    """
    global skipped_steps
    if args.allreduce_post_accumulation:
        # manually allreduce gradients after all accumulation steps
        # check for Inf/NaN
        # 1. allocate an uninitialized buffer for flattened gradient
        loss_scale = _amp_state.loss_scalers[0].loss_scale() if args.fp16 else 1
        master_grads = [p.grad for p in amp.master_params(optimizer)
                        if p.grad is not None]
        flat_grad_size = sum(p.numel() for p in master_grads)
        allreduce_dtype = (torch.float16 if args.allreduce_post_accumulation_fp16
                           else torch.float32)
        flat_raw = torch.empty(flat_grad_size, device='cuda',
                               dtype=allreduce_dtype)
        # 2. combine unflattening and predivision of unscaled 'raw' gradient
        allreduced_views = apex_C.unflatten(flat_raw, master_grads)
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(
            65536,
            overflow_buf,
            [master_grads, allreduced_views],
            loss_scale / (get_world_size() * args.gradient_accumulation_steps))
        # 3. sum gradient across ranks. Because of the predivision, this
        #    averages the gradient
        torch.distributed.all_reduce(flat_raw)
        # 4. combine unscaling and unflattening of allreduced gradient
        overflow_buf.zero_()
        amp_C.multi_tensor_scale(
            65536,
            overflow_buf,
            [allreduced_views, master_grads],
            1. / loss_scale)
        # 5. update loss scale
        if args.fp16:
            scaler = _amp_state.loss_scalers[0]
            # Temporarily point the scaler at our overflow flag so that
            # update_scale() sees the result of the manual unscale above.
            old_overflow_buf = scaler._overflow_buf
            scaler._overflow_buf = overflow_buf
            had_overflow = scaler.update_scale()
            # Restore the scaler's own buffer. The previous code assigned to
            # a misspelled attribute (`_overfloat_buf`), which left the
            # scaler permanently bound to `overflow_buf`.
            scaler._overflow_buf = old_overflow_buf
        else:
            had_overflow = 0
        # 6. call optimizer step function
        if had_overflow == 0:
            optimizer.step()
            global_step += 1
        else:
            # Overflow detected, print message and clear gradients
            skipped_steps += 1
            if is_main_process():
                scaler = _amp_state.loss_scalers[0]
                dllogger.log(step="PARAMETER",
                             data={"loss_scale": scaler.loss_scale()})
            if _amp_state.opt_properties.master_weights:
                for param in optimizer._amp_stash.all_fp32_from_fp16_params:
                    param.grad = None
        # Setting grads to None (instead of zero_grad) frees the buffers.
        for param in model.parameters():
            param.grad = None
    else:
        if args.apply_optimizer > 0:
            optimizer.step()
        # optimizer.zero_grad()
        for param in model.parameters():
            param.grad = None
        global_step += 1

    return global_step
def checkpoint_step(args, epoch, global_step, model, optimizer, grad_scaler, last3_checkpoint_paths):
    """Save a training checkpoint and keep only the three most recent ones.

    Only the main process saves, and only when ``args.skip_checkpoint`` is
    not set. ``last3_checkpoint_paths`` is mutated in place.

    Args:
        args: Run arguments (``skip_checkpoint``, ``resume_step``,
            ``phase2``, ``phase1_end_step``, ``output_dir``, ``do_train``).
        epoch: Epoch number stored in the checkpoint.
        global_step: Step used to name the checkpoint file.
        model: Model to save; a wrapper's ``module`` attribute is unwrapped.
        optimizer: Optimizer whose state dict is stored.
        grad_scaler: Gradient scaler whose state dict is stored.
        last3_checkpoint_paths: List of the most recently written checkpoint
            paths, oldest first.
    """
    torch.cuda.synchronize()
    if not is_main_process() or args.skip_checkpoint:
        return

    # Save a trained model
    dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step})
    # Only save the model itself, not a parallel wrapper around it.
    model_to_save = getattr(model, 'module', model)

    # Phase 2 runs resumed from a checkpoint number their files after the
    # phase 1 steps.
    if args.resume_step < 0 or not args.phase2:
        file_step = global_step
    else:
        file_step = global_step + args.phase1_end_step
    output_save_file = os.path.join(args.output_dir,
                                    "ckpt_{}.pt".format(file_step))

    if not args.do_train:
        return

    state = {
        'model': model_to_save.state_dict(),
        'optimizer': optimizer.state_dict(),
        'grad_scaler': grad_scaler.state_dict(),
        'epoch': epoch,
    }
    torch.save(state, output_save_file)

    # The new checkpoint could have a name already in
    # last3_checkpoint_paths. In this case, torch.save will overwrite
    # the old file; thus, we need to take the name out of
    # last3_checkpoint_paths and append it to the last.
    if output_save_file in last3_checkpoint_paths:
        last3_checkpoint_paths.remove(output_save_file)
    last3_checkpoint_paths.append(output_save_file)

    if len(last3_checkpoint_paths) > 3:
        oldest_checkpoint = last3_checkpoint_paths.pop(0)
        os.remove(oldest_checkpoint)
def get_train_features(data_dir, bert_model, max_seq_length, do_lower_case, local_rank, train_batch_size, gradient_accumulation_steps, num_train_epochs, tokenizer, processor):
    """Return training features, using an on-disk pickle cache when possible.

    The cache file lives in ``data_dir`` and is named
    ``<last component of bert_model>_<max_seq_length>_<do_lower_case>``.
    If it cannot be loaded, the features are rebuilt from the processor's
    training examples and the main process writes them back to the cache.

    Args:
        data_dir: Directory holding the training data and the cache file.
        bert_model: Model name or path; its last non-empty ``/`` component
            is used in the cache file name.
        max_seq_length: Maximum sequence length used for the features.
        do_lower_case: Lower-casing flag; part of the cache file name.
        local_rank: Unused by this function.
        train_batch_size: Unused by this function.
        gradient_accumulation_steps: Unused by this function.
        num_train_epochs: Unused by this function.
        tokenizer: Tokenizer passed to ``convert_examples_to_features``.
        processor: Provides ``get_train_examples`` and ``get_labels``.

    Returns:
        The training features (loaded from the cache or freshly built).
    """
    model_name = list(filter(None, bert_model.split('/'))).pop()
    cached_train_features_file = os.path.join(
        data_dir,
        '{0}_{1}_{2}'.format(model_name, str(max_seq_length),
                             str(do_lower_case)),
    )

    try:
        with open(cached_train_features_file, "rb") as reader:
            # NOTE(security): pickle.load can execute arbitrary code; the
            # cache file must only ever come from a trusted location.
            train_features = pickle.load(reader)
    except Exception:
        # Any failure to read the cache (missing, truncated, stale classes)
        # falls back to regenerating the features. Unlike a bare `except:`,
        # this lets KeyboardInterrupt / SystemExit propagate.
        logger.info("Did not find pre-processed features from {}".format(
            cached_train_features_file))
    else:
        logger.info("Loaded pre-processed features from {}".format(
            cached_train_features_file))
        return train_features

    train_examples = processor.get_train_examples(data_dir)
    train_features, _ = convert_examples_to_features(
        train_examples,
        processor.get_labels(),
        max_seq_length,
        tokenizer,
    )

    # Only one process writes the cache.
    if is_main_process():
        logger.info(" Saving train features into cached file %s",
                    cached_train_features_file)
        with open(cached_train_features_file, "wb") as writer:
            pickle.dump(train_features, writer)

    return train_features
def voc_evaluate(model, data_loader, device):
    """Run detection on ``data_loader`` and evaluate it in VOC format.

    Detections are grouped into 21 class slots per image, gathered from all
    processes, and the main process writes the VOC result files and runs
    the Python evaluation. The intra-op thread count is set to 1 for the
    duration of the call and restored at the end.

    Args:
        model: Detection model; called as ``model(images)`` and expected to
            return dicts with ``boxes``, ``labels`` and ``scores``.
        data_loader: Yields ``(images, targets)``; ``targets[0]['name']``
            holds the image name as a tensor of character codes.
        device: Device the inputs are moved to.
    """
    num_classes = 21

    saved_num_threads = torch.get_num_threads()
    torch.set_num_threads(1)
    cpu_device = torch.device("cpu")
    model.eval()
    metric_logger = utils.MetricLogger(delimiter=" ")
    header = 'Test:'

    all_boxes = [[] for _ in range(num_classes)]
    image_index = []
    for image, targets in metric_logger.log_every(data_loader, 100, header):
        image = [img.to(device) for img in image]
        targets = [{key: value.to(device) for key, value in target.items()}
                   for target in targets]

        torch.cuda.synchronize()
        model_time = time.time()
        outputs = model(image)

        # The image name is stored as a tensor of character codes.
        name = ''.join(chr(code) for code in targets[0]['name'].tolist())
        image_index.append(name)

        outputs = [{key: value.to(cpu_device) for key, value in out.items()}
                   for out in outputs]

        # Collect [x1, y1, x2, y2, score]-style rows per class label
        # (box coordinates followed by the score).
        image_boxes = [[] for _ in range(num_classes)]
        for out in outputs:
            for det in range(out['boxes'].shape[0]):
                row = torch.cat(
                    [out['boxes'][det], out['scores'][det].unsqueeze(0)],
                    dim=0)
                image_boxes[out['labels'][det]].append(row)

        # makes sure that the all_boxes is filled with empty array when
        # there are no boxes in image_boxes
        for cls in range(num_classes):
            if image_boxes[cls]:
                all_boxes[cls].append([torch.stack(image_boxes[cls])])
            else:
                all_boxes[cls].append([])

        model_time = time.time() - model_time

    metric_logger.synchronize_between_processes()

    all_boxes_gathered = utils.all_gather(all_boxes)
    image_index_gathered = utils.all_gather(image_index)

    # results from all processes are gathered here
    if utils.is_main_process():
        all_boxes = [[] for _ in range(num_classes)]
        for gathered_boxes in all_boxes_gathered:
            for merged, per_process in zip(all_boxes, gathered_boxes):
                merged += per_process
        image_index = []
        for per_process_index in image_index_gathered:
            image_index += per_process_index

        _write_voc_results_file(
            all_boxes, image_index, data_loader.dataset.root,
            data_loader.dataset._transforms.transforms[0].CLASSES)
        _do_python_eval(data_loader)

    torch.set_num_threads(saved_num_threads)
def _should_plot(self, epoch, iteration, total_iterations):
    """Decide whether the current iteration should produce a plot.

    Plotting happens only in the main process with a truthy
    ``self.plot_freq``, on epochs that are a multiple of ``plot_freq`` or on
    the last epoch, and there roughly ten times per epoch.

    Args:
        epoch: Current epoch index.
        iteration: Current iteration index within the epoch.
        total_iterations: Number of iterations in the epoch.

    Returns:
        True if a plot should be produced, False otherwise.
    """
    if not (utils.is_main_process() and self.plot_freq):
        return False

    on_plot_epoch = (epoch % self.plot_freq == 0
                     or epoch == self.start_epoch + self.epochs - 1)
    if not on_plot_epoch:
        return False

    # Plot about ten times per epoch (every iteration for short epochs).
    period = max(total_iterations // 10, 1)
    return iteration % period == 0
def maybe_load(self):
    """Restore models, optimizer and file bookkeeping from the latest checkpoint.

    Looks in ``self.args.output_dir`` for files ending in
    ``.pt.<self.team>`` and resumes from the one with the highest step
    number. When none is found, ``self.global_step``, ``self.f_id`` and
    ``self.files`` are left as ``None`` and nothing is loaded.
    """
    self.global_step = None
    self.f_id = None
    self.files = None
    checkpoint = None

    suffix = '.pt.{}'.format(self.team)
    if chio.exists(self.args.output_dir):
        model_names = [name for name in chio.list(self.args.output_dir)
                       if name.endswith(".pt.{}".format(self.team))]
        if model_names:
            # File names look like "ckpt_<step>.pt.<team>".
            self.args.resume_step = max(
                int(name.split(suffix)[0].split('_')[1].strip())
                for name in model_names)
            self.global_step = self.args.resume_step

    if self.global_step is None:
        return

    checkpoint_path = os.path.join(
        self.args.output_dir,
        "ckpt_{}.pt.{}".format(self.global_step, self.team))
    print("Load from {}".format(checkpoint_path))
    with chio.open(checkpoint_path, "rb") as f:
        checkpoint = torch.load(f, map_location="cpu")

    self.model.load_state_dict(checkpoint['model'], strict=False)
    self.another_model.load_state_dict(
        checkpoint['another_model'], strict=False)

    if self.args.phase2:
        self.global_step -= self.args.phase1_end_step
    if is_main_process():
        print("resume step from ", self.args.resume_step)

    optimizer_state = checkpoint['optimizer']
    if self.args.phase2:
        # Override hyperparameters from Phase 1
        for key in list(optimizer_state['state'].keys()):
            optimizer_state['state'][key]['step'] = self.global_step
        for group in optimizer_state['param_groups']:
            group['t_total'] = self.args.max_steps
            group['warmup'] = self.args.warmup_proportion
            group['lr'] = self.args.learning_rate
    self.optimizer.load_state_dict(checkpoint['optimizer'])

    # The first stored entry is kept as f_id, the remaining ones as files.
    self.f_id = checkpoint['files'][0]
    self.files = checkpoint['files'][1:]
def write_scalars(self, scalars, global_step, name=''):
    """Write a dict of scalars to ``self.writer`` (main process only).

    Args:
        scalars: Mapping of tag to scalar value.
        global_step: Step value recorded with every scalar.
        name: Optional suffix; when given, each tag becomes ``<tag>/<name>``.
    """
    if not utils.is_main_process():
        return

    tagged = scalars
    if name:
        tagged = {}
        for tag, value in scalars.items():
            tagged[f'{tag}/{name}'] = value

    for tag, value in tagged.items():
        self.writer.add_scalar(tag, value, global_step)
def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, writer):
    """Train ``model`` for one epoch and log the learning rate to ``writer``.

    Args:
        model: Called as ``model(images, targets)``; must return a dict of
            loss tensors.
        optimizer: Optimizer stepped once per batch.
        data_loader: Iterable yielding ``(images, targets)`` batches.
        device: Device the images and target tensors are moved to.
        epoch: Current epoch index; a warm-up LR scheduler is used only when
            it is 0.
        print_freq: Logging period handed to ``MetricLogger.log_every``.
        writer: Summary writer; receives the ``"lr"`` scalar from the main
            process.

    Returns:
        The ``utils.MetricLogger`` holding the epoch's statistics.
    """
    # Only these target entries are forwarded to the model.
    target_keys = ["boxes", "labels", "area", "iscrowd"]

    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter("lr", utils.SmoothedValue(window_size=1, fmt="{value:.6f}"))
    header = "Epoch: [{}]".format(epoch)

    # Linear warm-up is only applied during the very first epoch.
    warmup_scheduler = None
    if epoch == 0:
        warmup_factor = 1.0 / 1000
        warmup_iters = min(1000, len(data_loader) - 1)
        warmup_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor)

    batches = metric_logger.log_every(data_loader, print_freq, header)
    for step, (images, targets) in enumerate(batches):
        images = [image.to(device) for image in images]
        targets = [
            {key: value.to(device) for key, value in target.items() if key in target_keys}
            for target in targets
        ]

        loss_dict = model(images, targets)
        losses = sum(loss_dict.values())

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        losses_reduced = sum(loss_dict_reduced.values())
        loss_value = losses_reduced.item()

        # Abort the whole run on NaN/Inf loss.
        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        if warmup_scheduler is not None:
            warmup_scheduler.step()

        metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
        current_lr = optimizer.param_groups[0]["lr"]
        metric_logger.update(lr=current_lr)

        if is_main_process():
            # Global iteration index across epochs.
            writer.add_scalar(
                "lr", optimizer.param_groups[0]["lr"], epoch * len(data_loader) + step
            )

    return metric_logger
def prepare_model(args, device):
    """Build the BERT pre-training model and optionally restore a checkpoint.

    Args:
        args: Run arguments (``config_file``, ``train_batch_size``,
            ``max_seq_length``, ``resume_from_checkpoint``, ``resume_step``,
            ``init_checkpoint``, ``output_dir``, ``phase2``,
            ``phase1_end_step``). ``args.resume_step`` is updated in place
            when it is ``-1`` and no ``init_checkpoint`` is given.
        device: Device passed to ``ort_supplement.create_ort_trainer``.

    Returns:
        A ``(model, checkpoint, global_step)`` tuple; ``checkpoint`` is
        ``None`` and ``global_step`` is 0 when not resuming.
    """
    # Prepare model
    config = modeling.BertConfig.from_json_file(args.config_file)

    # Padding for divisibility by 8
    remainder = config.vocab_size % 8
    if remainder != 0:
        config.vocab_size += 8 - remainder

    model = modeling.BertForPreTraining(config)
    criterion = BertPretrainingCriterion(config.vocab_size,
                                         args.train_batch_size,
                                         args.max_seq_length)
    model.enable_apex(False)
    model = bert_model_with_loss(model, criterion)
    model = ort_supplement.create_ort_trainer(args, device, model)

    if not args.resume_from_checkpoint:
        return model, None, 0

    # Without an explicit init checkpoint we resume from output_dir.
    resume_from_output_dir = not args.init_checkpoint

    if args.resume_step == -1 and resume_from_output_dir:
        # Pick the highest step among files named "ckpt_<step>.pt".
        model_names = [
            name for name in os.listdir(args.output_dir)
            if name.endswith(".pt")
        ]
        args.resume_step = max(
            int(name.split('.pt')[0].split('_')[1].strip())
            for name in model_names)

    if resume_from_output_dir:
        global_step = args.resume_step
        checkpoint = torch.load(os.path.join(
            args.output_dir, "ckpt_{}.pt".format(global_step)),
                                map_location="cpu")
    else:
        global_step = 0
        checkpoint = torch.load(args.init_checkpoint, map_location="cpu")

    model.load_state_dict(checkpoint['model'], strict=False)

    if args.phase2 and resume_from_output_dir:
        global_step -= args.phase1_end_step
    if is_main_process():
        print("resume step from ", args.resume_step)

    return model, checkpoint, global_step
def _setup_output(self, output_dir, out_file, overwrite=False):
    """Create the output directory and, in the main process, the writers.

    Args:
        output_dir: Directory for all outputs; created if missing.
        out_file: When truthy, ``self.out_file`` is set to
            ``<output_dir>/train.txt``.
        overwrite: When True, the main process first removes every file and
            sub-directory already present in ``output_dir``.
    """
    self.output_dir = Path(output_dir)
    self.output_dir.mkdir(parents=True, exist_ok=True)
    self.out_file = None

    if not utils.is_main_process():
        return

    if overwrite:
        # Wipe previous contents so old results do not mix with new ones.
        for entry in self.output_dir.iterdir():
            if entry.is_dir():
                shutil.rmtree(entry)
            elif entry.is_file():
                entry.unlink()

    if out_file:
        self.out_file = self.output_dir / 'train.txt'
    self.writer = SummaryWriter(output_dir)
def __new__(cls, config=None) -> Any:
    """Return the singleton instance, creating it on first configured use.

    The first call must pass ``config`` (it creates the instance); every
    later call must pass no ``config`` and receives the same instance.

    Args:
        config: Configuration for the first construction, ``None``
            afterwards.

    Returns:
        The shared instance, or a throwaway object in non-main processes on
        Windows.
    """
    # HACK: lets the singleton coexist with multiprocessing (num_workers > 0
    # on Windows). It relies on an implementation detail: this class is not
    # really constructed in spawned workers (multiprocessing is only used by
    # the PyTorch dataloader, which does not create such an object), so a
    # throwaway instance is enough there.
    in_windows_worker = (config is None and not is_main_process()
                         and is_os_windows())
    if in_windows_worker:
        return object.__new__(cls)

    # Singleton: once initialized (in the __init__ of the experiment) it is
    # available from everywhere.
    if cls._instance is not None:
        assert config is None
        return cls._instance

    assert config is not None, "maybe you used relative module import of this file? don't do this"
    cls._instance = object.__new__(cls)
    return cls._instance
def create_checkpoint(self, model, optimizer, epoch, lr_scheduler):
    """Save model, optimizer and scheduler state for ``epoch`` (main process only).

    The file is written to ``self.output_dir`` as ``checkpoint<epoch>.tar``
    with the epoch zero-padded to three digits.

    Args:
        model: Model to save; ``DataParallel`` / ``DistributedDataParallel``
            wrappers are unwrapped first.
        optimizer: Optimizer whose state dict is stored.
        epoch: Epoch number stored in, and used to name, the checkpoint.
        lr_scheduler: Scheduler whose state dict is stored.
    """
    if not utils.is_main_process():
        return

    parallel_wrappers = (nn.DataParallel, nn.parallel.DistributedDataParallel)
    bare_model = model.module if isinstance(model, parallel_wrappers) else model

    state = {
        'epoch': epoch,
        'model_state_dict': bare_model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'lr_scheduler': lr_scheduler.state_dict(),
    }
    torch.save(state, self.output_dir / f'checkpoint{epoch:03}.tar')
def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, device, epoch, print_freq):
    """Train a segmentation model for one epoch with optional tensorboard logging.

    Uses the module-level ``n_iter`` counter and the module-level ``args``
    (``print_freq``, ``tensorboard``, ``writer``, ``colormap``).

    Args:
        model: Called as ``model(data)``; its output must contain ``'out'``.
        criterion: Called as ``criterion(output, target)``.
        optimizer: Optimizer stepped once per batch.
        data_loader: Iterable yielding ``(data, target)`` batches.
        lr_scheduler: Scheduler stepped once per batch.
        device: Device the targets are moved to.
        epoch: Current epoch index (used in the log header).
        print_freq: Logging period handed to ``MetricLogger.log_every``.
    """
    global n_iter

    model.train()
    metric_logger = utils.MetricLogger(delimiter=" ")
    metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value}'))
    header = 'Epoch: [{}]'.format(epoch)

    for data, target in metric_logger.log_every(data_loader, print_freq, header):
        n_iter = n_iter + 1

        optimizer.zero_grad()
        target = target.to(device)
        output = model(data)
        loss = criterion(output, target).mean()

        # visualization: per-pixel predicted class
        segmap = torch.argmax(output['out'], dim=1)

        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        metric_logger.update(loss=loss.item(), lr=optimizer.param_groups[0]["lr"])

        # NOTE(review): the logging period comes from the global
        # args.print_freq, not from the print_freq parameter.
        if n_iter % args.print_freq != 0:
            continue
        if not (args.tensorboard and utils.is_main_process()):
            continue

        args.writer.add_scalar('SupLoss', loss.item(), n_iter)

        # Images are logged 100x less often than the scalar loss.
        if n_iter % (args.print_freq * 100) == 0:
            grid = torchvision.utils.make_grid(data[:1])
            # Min-max normalize; the epsilon avoids division by zero.
            grid = (grid - grid.min()) / (grid.max() - grid.min() + 1e-5)
            args.writer.add_image('sup images', grid, n_iter)

            # Map class indices of the first sample to colors, then HWC -> CHW.
            segmap = args.colormap[segmap[0].detach().cpu().numpy()]
            segmap = segmap / 255.
            args.writer.add_image('sup segmaps', segmap.transpose((2, 0, 1)), n_iter)
def __init__(self, lr=0.02, momentum=0.9, weight_decay=1e-4, lr_steps=None, lr_gamma=0.1, data_path='.', output_dir='.', out_file=False, batch_size=32, device='cpu', epochs=1, num_workers=4, dist_url='env://', print_freq=100, plot_freq=None, data_parallel=False, overwrite=False):
    """Store the hyper-parameters and set up output, device and checkpoint.

    Args:
        lr: Learning rate.
        momentum: Optimizer momentum.
        weight_decay: Weight decay.
        lr_steps: Optional LR schedule steps.
        lr_gamma: LR decay factor.
        data_path: Location of the dataset.
        output_dir: Directory for logs and checkpoints.
        out_file: Whether to set up a text output file (see
            ``_setup_output``).
        batch_size: Batch size.
        device: Device type; combined with the index returned by
            ``_init_distributed_mode`` into ``self.device``.
        epochs: Number of epochs.
        num_workers: Number of data-loading workers.
        dist_url: URL stored as ``self.dist_url`` before
            ``_init_distributed_mode`` is called.
        print_freq: Console logging period.
        plot_freq: Plotting period; stored only in the main process,
            ``None`` elsewhere.
        data_parallel: Whether to use ``DataParallel``; mutually exclusive
            with distributed mode.
        overwrite: Whether existing output is removed first.
    """
    self._setup_output(output_dir, out_file, overwrite)

    # Optimizer / schedule hyper-parameters.
    self.lr = lr
    self.momentum = momentum
    self.weight_decay = weight_decay
    self.lr_steps = lr_steps
    self.lr_gamma = lr_gamma

    # Logging cadence.
    self.plot_freq = plot_freq if utils.is_main_process() else None
    self.print_freq = print_freq

    # Data and run length.
    self.epochs = epochs
    self.data_path = data_path
    self.batch_size = batch_size
    self.num_workers = num_workers
    self.start_epoch = 0

    # Distributed setup yields the device index for this process.
    self.dist_url = dist_url
    device_index = self._init_distributed_mode()
    self.device = torch.device(f'{device}:{device_index}')

    self.data_parallel = data_parallel
    assert not (self.data_parallel and self.distributed
                ), 'use either DataParallel or DistributedDataParallel'

    self.checkpoint = infer_checkpoint(self.output_dir)
def accumulate_predictions_from_multiple_gpus(predictions_per_gpu):
    """Gather per-process predictions and merge them into one ordered list.

    Args:
        predictions_per_gpu: Dict mapping image index to prediction for the
            images handled by this process.

    Returns:
        In the main process, the predictions as a list ordered by image
        index (empty if nothing was gathered). ``None`` in every other
        process.
    """
    all_predictions = all_gather(predictions_per_gpu)
    if not is_main_process():
        return

    # merge the list of dicts
    predictions = {}
    for per_process in all_predictions:
        predictions.update(per_process)

    # convert a dict where the key is the index in a list
    image_ids = sorted(predictions)
    logger = logging.getLogger("RetinaNet.inference")

    if not image_ids:
        # Previously this case crashed with IndexError on image_ids[-1].
        logger.warning(
            "No predictions were gathered from any process; "
            "returning an empty list")
        return []

    # Ids are expected to be exactly 0..N-1; anything else means some
    # images are missing from the gathered results.
    if len(image_ids) != image_ids[-1] + 1:
        logger.warning(
            "Number of images that were gathered from multiple processes is not "
            "a contiguous set. Some images might be missing from the evaluation"
        )

    # convert to a list
    return [predictions[i] for i in image_ids]
def record_hparams(self, metrics=None):
    """Record the run's hyper-parameters and metrics (main process only).

    Args:
        metrics: Optional mapping of metric name to value; each name is
            written under ``hparam/<name>``.
    """
    metrics = metrics if metrics else {}
    if not utils.is_main_process():
        return

    hparams_dict = {
        'optimizer': 'SGD',
        'lr': self.lr,
        'momentum': self.momentum,
        'weight_decay': self.weight_decay,
        'gamma': self.lr_gamma,
        # Effective batch size across all processes.
        'bsize': self.batch_size * self.world_size,
    }
    if self.lr_steps:
        hparams_dict['lr_steps'] = ', '.join(map(str, self.lr_steps))

    metrics_dict = {}
    for name, value in metrics.items():
        metrics_dict[f'hparam/{name}'] = value

    self.writer.add_hparams(hparams_dict, metrics_dict)
def inference(model, data_loader, dataset_name, device='cuda', output_folder=None, expected_results=(), expected_results_sigma_tol=4):
    """Run the model over a dataset, gather predictions and evaluate them.

    Args:
        model: Model passed to ``compute_on_dataset``.
        data_loader: Loader whose ``dataset`` is evaluated.
        dataset_name: Name used in log messages.
        device: Device (or device string) to run on.
        output_folder: If truthy, predictions are saved there as
            ``predictions.pth`` and it is forwarded to ``evaluate``.
        expected_results: Forwarded to ``evaluate``.
        expected_results_sigma_tol: Forwarded to ``evaluate``.

    Returns:
        The result of ``evaluate`` in the main process, ``None`` in every
        other process.
    """
    device = torch.device(device)
    num_devices = get_world_size()
    logger = logging.getLogger("RetinaNet.inference")
    dataset = data_loader.dataset
    num_images = len(dataset)
    logger.info("Start evaluation on {} dataset({} images).".format(dataset_name, num_images))

    total_timer = Timer()
    inference_timer = Timer()
    total_timer.tic()
    predictions = compute_on_dataset(model, data_loader, device, inference_timer)
    # wait for all processes to complete before measuring the time
    synchronize()
    total_time = total_timer.toc()
    total_time_str = get_time_str(total_time)
    seconds_per_image = total_time * num_devices / len(dataset)
    logger.info(
        "Total run time: {} ({} s / img per device, on {} devices)".format(
            total_time_str, seconds_per_image, num_devices
        )
    )

    predictions = accumulate_predictions_from_multiple_gpus(predictions)
    if not is_main_process():
        return

    if output_folder:
        torch.save(predictions, os.path.join(output_folder, "predictions.pth"))

    return evaluate(
        dataset=dataset,
        predictions=predictions,
        output_folder=output_folder,
        expected_results=expected_results,
        expected_results_sigma_tol=expected_results_sigma_tol,
    )
def main(): # args = parse_arguments() # del args.local_rank # print(args) # args_to_yaml(args, 'config_finetune_train_glue_mrpc.yaml') # exit(0) config_yaml, local_rank = parse_my_arguments() args = args_from_yaml(config_yaml) args.local_rank = local_rank """ Experiment Setup """ if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and args.do_train: print( "WARNING: Output directory ({}) already exists and is not empty.". 
format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) processors = { "cola": ColaProcessor, "mnli": MnliProcessor, "mrpc": MrpcProcessor, } num_labels_task = { "cola": 2, "mnli": 3, "mrpc": 2, } task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % task_name) processor = processors[task_name]() num_labels = num_labels_task[task_name] label_list = processor.get_labels() tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) train_examples = None num_train_optimization_steps = None if args.do_train: train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) """ Prepare Model """ # Prepare model cache_dir = args.cache_dir if args.cache_dir else os.path.join( PYTORCH_PRETRAINED_BERT_CACHE, 'distributed_{}'.format( args.local_rank)) model = BertForSequenceClassification.from_pretrained( args.bert_model, cache_dir=cache_dir, num_labels=num_labels) state_dict = torch.load(args.init_checkpoint, map_location='cpu') state_dict = state_dict.get( 'model', state_dict ) # in a full checkpoint weights are saved in state_dict['model'] model.load_state_dict(state_dict, strict=False) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) plain_model = getattr(model, 'module', model) with open(args.sparsity_config, 'r') as f: raw_dict = yaml.load(f, Loader=yaml.SafeLoader) masks = dict.fromkeys(raw_dict['prune_ratios'].keys()) for param_name in list(masks.keys()): if get_parameter_by_name(plain_model, param_name) is None: print(f'[WARNING] Cannot find {param_name}') del masks[param_name] for param_name in masks: param = get_parameter_by_name(plain_model, param_name) non_zero_mask = torch.ne(param, 0).to(param.dtype) masks[param_name] = non_zero_mask """ Prepare Optimizer""" # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.fp16_utils.fp16_optimizer import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: """ Prepare Dataset """ train_features = convert_examples_to_features(train_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) """ Training Loop """ model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): if args.max_steps > 0 and global_step > args.max_steps: break batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, label_ids = batch loss = model(input_ids, segment_ids, input_mask, label_ids) if n_gpu > 1: loss = loss.mean() # mean() to average on 
multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 plain_model = getattr(model, 'module', model) for param_name, mask in masks.items(): get_parameter_by_name(plain_model, param_name).data *= mask """ Load Model for Evaluation """ if args.do_train: # Save a trained model and the associated configuration output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) if is_main_process( ): # only the main process should save the trained model model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self torch.save(model_to_save.state_dict(), output_model_file) with open(output_config_file, 'w') as f: f.write(model_to_save.config.to_json_string()) # Load a trained model and config that you have fine-tuned config = BertConfig(output_config_file) model = BertForSequenceClassification(config, num_labels=num_labels) model.load_state_dict(torch.load(output_model_file)) else: model = BertForSequenceClassification.from_pretrained( args.bert_model, num_labels=num_labels) state_dict = torch.load(args.init_checkpoint, map_location='cpu') state_dict = state_dict.get('model', state_dict) model.load_state_dict(state_dict, strict=False) model.to(device) """ Run Evaluation """ if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0): 
eval_examples = processor.get_dev_examples(args.data_dir) eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long) all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long) all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long) all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for input_ids, input_mask, segment_ids, label_ids in tqdm( eval_dataloader, desc="Evaluating"): input_ids = input_ids.to(device) input_mask = input_mask.to(device) segment_ids = segment_ids.to(device) label_ids = label_ids.to(device) with torch.no_grad(): tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids) logits = model(input_ids, segment_ids, input_mask) logits = logits.detach().cpu().numpy() label_ids = label_ids.to('cpu').numpy() tmp_eval_accuracy = accuracy(logits, label_ids) eval_loss += tmp_eval_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'global_step': global_step, 'loss': loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as 
writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def main(args):
    """Train (or only evaluate) an image classifier described by ``args``.

    Steps, in order: initialise distributed mode, seed the RNGs per rank,
    build the train/val datasets and loaders, create the model (optionally
    loading fine-tuning weights with position-embedding interpolation),
    build optimizer / LR scheduler / loss (optionally wrapped for
    distillation), optionally resume from a checkpoint, then either run a
    single evaluation (``args.eval``) or the epoch loop with per-epoch
    checkpointing and logging to ``<output_dir>/log.txt``.

    Args:
        args: parsed command-line namespace. It is mutated in place:
            ``nb_classes``, ``lr`` (linearly scaled by the global batch
            size) and, on resume, ``start_epoch`` are written.

    Raises:
        NotImplementedError: if fine-tuning is combined with distillation
            outside of evaluation mode.
    """
    utils.init_distributed_mode(args)
    print(args)

    if args.distillation_type != 'none' and args.finetune and not args.eval:
        raise NotImplementedError(
            "Finetuning with distillation not yet supported")

    device = torch.device(args.device)

    # fix the seed for reproducibility (offset by rank so ranks differ)
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    # Distributed-style samplers are used unconditionally (the original
    # guarded this with `if True:  # args.distributed`, leaving its `else`
    # unreachable). With a world size of 1 they cover the whole dataset.
    num_tasks = utils.get_world_size()
    global_rank = utils.get_rank()
    if args.repeated_aug:
        sampler_train = RASampler(dataset_train,
                                  num_replicas=num_tasks,
                                  rank=global_rank,
                                  shuffle=True)
    else:
        sampler_train = torch.utils.data.DistributedSampler(
            dataset_train,
            num_replicas=num_tasks,
            rank=global_rank,
            shuffle=True)
    if args.dist_eval:
        if len(dataset_val) % num_tasks != 0:
            print(
                'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                'This will slightly alter validation results as extra duplicate entries are added to achieve '
                'equal num of samples per-process.')
        sampler_val = torch.utils.data.DistributedSampler(
            dataset_val,
            num_replicas=num_tasks,
            rank=global_rank,
            shuffle=False)
    else:
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        sampler=sampler_val,
        batch_size=int(1.5 * args.batch_size),
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False)

    mixup_fn = None
    mixup_active = (args.mixup > 0 or args.cutmix > 0.
                    or args.cutmix_minmax is not None)
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print(f"Creating model: {args.model}")
    model = create_model(
        args.model,
        pretrained=False,
        num_classes=args.nb_classes,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        drop_block_rate=None,
    )

    if args.finetune:
        if args.finetune.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.finetune, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.finetune, map_location='cpu')

        checkpoint_model = checkpoint['model']
        state_dict = model.state_dict()
        # Drop classifier heads whose shape differs from the new model's.
        for k in [
                'head.weight', 'head.bias', 'head_dist.weight',
                'head_dist.bias'
        ]:
            if (k in checkpoint_model
                    and checkpoint_model[k].shape != state_dict[k].shape):
                print(f"Removing key {k} from pretrained checkpoint")
                del checkpoint_model[k]

        # interpolate position embedding
        pos_embed_checkpoint = checkpoint_model['pos_embed']
        embedding_size = pos_embed_checkpoint.shape[-1]
        num_patches = model.patch_embed.num_patches
        num_extra_tokens = model.pos_embed.shape[-2] - num_patches
        # height (== width) for the checkpoint position embedding
        orig_size = int(
            (pos_embed_checkpoint.shape[-2] - num_extra_tokens)**0.5)
        # height (== width) for the new position embedding
        new_size = int(num_patches**0.5)
        # class_token and dist_token are kept unchanged
        extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
        # only the position tokens are interpolated
        pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
        pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size,
                                        embedding_size).permute(0, 3, 1, 2)
        pos_tokens = torch.nn.functional.interpolate(pos_tokens,
                                                     size=(new_size, new_size),
                                                     mode='bicubic',
                                                     align_corners=False)
        pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
        new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
        checkpoint_model['pos_embed'] = new_pos_embed

        model.load_state_dict(checkpoint_model, strict=False)

    model.to(device)

    model_ema = None
    if args.model_ema:
        # Important to create EMA model after cuda(), DP wrapper, and AMP but before SyncBN and DDP wrapper
        model_ema = ModelEma(model,
                             decay=args.model_ema_decay,
                             device='cpu' if args.model_ema_force_cpu else '',
                             resume='')

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu])
        model_without_ddp = model.module
    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Linear LR scaling with the global batch size (reference batch: 512).
    linear_scaled_lr = (args.lr * args.batch_size * utils.get_world_size() /
                        512.0)
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()

    lr_scheduler, _ = create_scheduler(args, optimizer)

    # The loss must follow the same condition that enables `mixup_fn`:
    # mixup_fn is created when mixup OR cutmix is active, so keying this on
    # `args.mixup` alone would pair mixed targets with a hard-label loss in
    # a cutmix-only run.
    if mixup_active:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    teacher_model = None
    if args.distillation_type != 'none':
        assert args.teacher_path, 'need to specify teacher-path when using distillation'
        print(f"Creating teacher model: {args.teacher_model}")
        teacher_model = create_model(
            args.teacher_model,
            pretrained=False,
            num_classes=args.nb_classes,
            global_pool='avg',
        )
        if args.teacher_path.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.teacher_path, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.teacher_path, map_location='cpu')
        teacher_model.load_state_dict(checkpoint['model'])
        teacher_model.to(device)
        teacher_model.eval()

    # wrap the criterion in our custom DistillationLoss, which
    # just dispatches to the original criterion if args.distillation_type is 'none'
    criterion = DistillationLoss(criterion, teacher_model,
                                 args.distillation_type,
                                 args.distillation_alpha,
                                 args.distillation_tau)

    output_dir = Path(args.output_dir)
    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Training state is only restored when all three pieces are present.
        if (not args.eval and 'optimizer' in checkpoint
                and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint):
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if args.model_ema:
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])

    if args.eval:
        test_stats = evaluate(data_loader_val, model, device)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print(f"Start training for {args.epochs} epochs")
    start_time = time.time()
    max_accuracy = 0.0
    for epoch in range(args.start_epoch, args.epochs):
        # The train sampler is always epoch-seeded (see sampler setup), so
        # it must be advanced every epoch even in a single-process run;
        # otherwise each epoch replays the same sample order.
        sampler_train.set_epoch(epoch)

        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            model_ema,
            mixup_fn,
            set_training_mode=args.finetune == ''  # keep in eval mode during finetuning
        )

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_path = output_dir / ('checkpoint_%04d.pth' % (epoch))
            utils.save_on_master(
                {
                    'model': model_without_ddp.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr_scheduler': lr_scheduler.state_dict(),
                    'epoch': epoch,
                    # model_ema stays None when EMA is disabled; store None
                    # rather than handing None to get_state_dict.
                    'model_ema': (get_state_dict(model_ema)
                                  if model_ema is not None else None),
                    'scaler': loss_scaler.state_dict(),
                    'args': args,
                }, checkpoint_path)

        log_stats = {f'train_{k}': v for k, v in train_stats.items()}
        if not args.train_without_eval:
            test_stats = evaluate(data_loader_val, model, device)
            print(
                f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
            )
            max_accuracy = max(max_accuracy, test_stats["acc1"])
            print(f'Max accuracy: {max_accuracy:.2f}%')
            log_stats.update(
                {f'test_{k}': v for k, v in test_stats.items()})
        log_stats['epoch'] = epoch
        log_stats['n_parameters'] = n_parameters

        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
# Evaluation pipeline: build the validation data loader, run `validate` on
# it, combine the metric via `ptutil`, and print pixAcc / mIoU from the main
# process only.

# Per-image preprocessing: tensor conversion followed by per-channel
# normalization (constants presumably the ImageNet mean/std — TODO confirm).
input_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize([.485, .456, .406], [.229, .224, .225]),
])
# Keyword arguments forwarded to the dataset constructor.
data_kwargs = {
    'base_size': args.base_size,
    'crop_size': args.crop_size,
    'transform': input_transform
}
val_dataset = get_segmentation_dataset(args.dataset,
                                       split=args.split,
                                       mode=args.mode,
                                       **data_kwargs)
# Second argument is False — presumably "shuffle"; verify against
# make_data_sampler's signature.
sampler = make_data_sampler(val_dataset, False, distributed)
# drop_last=False keeps the final, possibly smaller, batch.
batch_sampler = data.BatchSampler(sampler=sampler,
                                  batch_size=args.batch_size,
                                  drop_last=False)
val_data = data.DataLoader(val_dataset,
                           shuffle=False,
                           batch_sampler=batch_sampler,
                           num_workers=args.num_workers)
metric = SegmentationMetric(val_dataset.num_class)

# `validate` returns the metric object that is then accumulated below.
metric = validate(model, val_data, metric, device)
ptutil.synchronize()
pixAcc, mIoU = ptutil.accumulate_metric(metric)
if ptutil.is_main_process():
    print('pixAcc: %.4f, mIoU: %.4f' % (pixAcc, mIoU))
def eval_linear(args):
    """Fit a linear classifier on top of a frozen backbone and report accuracy.

    Builds ImageFolder train/val loaders from ``args.data_path``, loads the
    pretrained backbone selected by ``args.arch`` / ``args.patch_size``,
    trains a DDP-wrapped ``LinearClassifier`` with SGD and cosine annealing,
    validates every ``args.val_freq`` epochs (and on the last epoch), and on
    the main process appends stats to ``log.txt`` and rewrites
    ``checkpoint.pth.tar`` in ``args.output_dir`` after each epoch.

    Args:
        args: parsed command-line namespace.
    """
    utils.init_distributed_mode(args)
    print("git:\n {}\n".format(utils.get_sha()))
    arg_lines = [
        "%s: %s" % (k, str(v)) for k, v in sorted(dict(vars(args)).items())
    ]
    print("\n".join(arg_lines))
    cudnn.benchmark = True

    # ============ preparing data ... ============
    norm_mean = (0.485, 0.456, 0.406)
    norm_std = (0.229, 0.224, 0.225)
    train_transform = pth_transforms.Compose([
        pth_transforms.RandomResizedCrop(224),
        pth_transforms.RandomHorizontalFlip(),
        pth_transforms.ToTensor(),
        pth_transforms.Normalize(norm_mean, norm_std),
    ])
    val_transform = pth_transforms.Compose([
        pth_transforms.Resize(256, interpolation=3),
        pth_transforms.CenterCrop(224),
        pth_transforms.ToTensor(),
        pth_transforms.Normalize(norm_mean, norm_std),
    ])
    train_dir = os.path.join(args.data_path, "train")
    val_dir = os.path.join(args.data_path, "val")
    dataset_train = datasets.ImageFolder(train_dir, transform=train_transform)
    dataset_val = datasets.ImageFolder(val_dir, transform=val_transform)

    # Only the training loader is sharded across processes.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dataset_train)
    train_loader = torch.utils.data.DataLoader(
        dataset_train,
        sampler=train_sampler,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
    )
    val_loader = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=args.batch_size_per_gpu,
        num_workers=args.num_workers,
        pin_memory=True,
    )
    print(f"Data loaded with {len(dataset_train)} train and {len(dataset_val)} val imgs.")

    # ============ building network ... ============
    backbone_factory = vits.__dict__[args.arch]
    model = backbone_factory(patch_size=args.patch_size, num_classes=0)
    model.cuda()
    model.eval()
    print(f"Model {args.arch} {args.patch_size}x{args.patch_size} built.")
    # load weights to evaluate
    utils.load_pretrained_weights(model, args.pretrained_weights,
                                  args.checkpoint_key, args.arch,
                                  args.patch_size)

    # Input width of the classifier: one embed_dim chunk per selected block,
    # plus one more when avgpool_patchtokens is set.
    feature_dim = model.embed_dim * (args.n_last_blocks +
                                     int(args.avgpool_patchtokens))
    linear_classifier = LinearClassifier(feature_dim,
                                         num_labels=args.num_labels)
    linear_classifier = linear_classifier.cuda()
    linear_classifier = nn.parallel.DistributedDataParallel(
        linear_classifier, device_ids=[args.gpu])

    # set optimizer
    global_batch = args.batch_size_per_gpu * utils.get_world_size()
    optimizer = torch.optim.SGD(
        linear_classifier.parameters(),
        args.lr * global_batch / 256.,  # linear scaling rule
        momentum=0.9,
        weight_decay=0,  # we do not apply weight decay
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                           args.epochs,
                                                           eta_min=0)

    # Optionally resume from a checkpoint
    checkpoint_file = os.path.join(args.output_dir, "checkpoint.pth.tar")
    to_restore = {"epoch": 0, "best_acc": 0.}
    utils.restart_from_checkpoint(
        checkpoint_file,
        run_variables=to_restore,
        state_dict=linear_classifier,
        optimizer=optimizer,
        scheduler=scheduler,
    )
    start_epoch = to_restore["epoch"]
    best_acc = to_restore["best_acc"]

    last_epoch = args.epochs - 1
    for epoch in range(start_epoch, args.epochs):
        train_loader.sampler.set_epoch(epoch)

        train_stats = train(model, linear_classifier, optimizer, train_loader,
                            epoch, args.n_last_blocks,
                            args.avgpool_patchtokens)
        scheduler.step()

        log_stats = {f'train_{k}': v for k, v in train_stats.items()}
        log_stats['epoch'] = epoch

        if epoch % args.val_freq == 0 or epoch == last_epoch:
            test_stats = validate_network(val_loader, model,
                                          linear_classifier,
                                          args.n_last_blocks,
                                          args.avgpool_patchtokens)
            print(f"Accuracy at epoch {epoch} of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%")
            best_acc = max(best_acc, test_stats["acc1"])
            print(f'Max accuracy so far: {best_acc:.2f}%')
            log_stats.update(
                {f'test_{k}': v for k, v in test_stats.items()})

        # Logging and checkpointing happen on the main process only.
        if not utils.is_main_process():
            continue
        with (Path(args.output_dir) / "log.txt").open("a") as f:
            f.write(json.dumps(log_stats) + "\n")
        save_dict = {
            "epoch": epoch + 1,
            "state_dict": linear_classifier.state_dict(),
            "optimizer": optimizer.state_dict(),
            "scheduler": scheduler.state_dict(),
            "best_acc": best_acc,
        }
        torch.save(save_dict,
                   os.path.join(args.output_dir, "checkpoint.pth.tar"))

    print("Training of the supervised linear classifier on frozen features completed.\n"
          "Top-1 test accuracy: {acc:.1f}".format(acc=best_acc))
def _mean_latency_ms(sorted_latencies, fraction):
    """Return the mean, in milliseconds, of the fastest ``fraction`` of latencies.

    ``sorted_latencies`` must be in ascending order and hold seconds.
    Returns 0.0 when the selected prefix is empty, instead of dividing by
    zero on very short runs.
    """
    count = int(len(sorted_latencies) * fraction)
    if count == 0:
        return 0.0
    return sum(sorted_latencies[:count]) / count * 1000


def main():
    """Fine-tune and/or evaluate an ELECTRA question-answering model on SQuAD.

    Parses the command line, initialises Horovod, dllogger and TensorFlow
    (XLA / mixed precision), loads tokenizer, model and (cached) features,
    then for each epoch optionally trains, checkpoints, runs prediction on
    the main process, writes ``predictions.json`` / ``nbest_predictions.json``
    to ``args.output_dir`` and, with ``args.do_eval``, scores them with the
    external ``args.eval_script``.
    """
    args = parse_args()

    hvd.init()
    set_affinity(hvd.local_rank())

    if is_main_process():
        log("Running total processes: {}".format(get_world_size()))
    log("Starting process: {}".format(get_rank()))

    # Only the main process gets real logging backends.
    if is_main_process():
        dllogger.init(backends=[
            dllogger.JSONStreamBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                       filename=args.json_summary),
            dllogger.StdOutBackend(verbosity=dllogger.Verbosity.VERBOSE,
                                   step_format=format_step)
        ])
    else:
        dllogger.init(backends=[])

    tf.random.set_seed(args.seed)
    dllogger.log(step="PARAMETER", data={"SEED": args.seed})
    # script parameters
    BATCH_SIZE = args.train_batch_size
    EVAL_BATCH_SIZE = args.predict_batch_size
    USE_XLA = args.xla
    USE_AMP = args.amp
    EPOCHS = args.num_train_epochs

    if not args.do_train:
        EPOCHS = args.num_train_epochs = 1
        log("Since running inference only, setting args.num_train_epochs to 1")

    if not os.path.exists(args.output_dir) and is_main_process():
        os.makedirs(args.output_dir)

    # TensorFlow configuration
    gpus = tf.config.experimental.list_physical_devices('GPU')
    if gpus:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Pin this process to the GPU matching its local Horovod rank.
        tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()],
                                                   'GPU')
    tf.config.optimizer.set_jit(USE_XLA)
    if args.amp:
        policy = tf.keras.mixed_precision.experimental.Policy(
            "mixed_float16", loss_scale="dynamic")
        tf.keras.mixed_precision.experimental.set_policy(policy)
        print('Compute dtype: %s' % policy.compute_dtype)  # Compute dtype: float16
        print('Variable dtype: %s' % policy.variable_dtype)  # Variable dtype: float32

    if is_main_process():
        log("***** Loading tokenizer and model *****")
    # Load tokenizer and model from pretrained model/vocabulary.
    electra_model = args.electra_model
    config = ElectraConfig.from_pretrained(electra_model,
                                           cache_dir=args.cache_dir)
    config.update({"amp": args.amp})
    if args.vocab_file is None:
        tokenizer = ElectraTokenizer.from_pretrained(electra_model,
                                                     cache_dir=args.cache_dir)
    else:
        tokenizer = ElectraTokenizer(vocab_file=args.vocab_file,
                                     do_lower_case=args.do_lower_case)

    model = TFElectraForQuestionAnswering.from_pretrained(
        electra_model, config=config, cache_dir=args.cache_dir, args=args)

    if is_main_process():
        log("***** Loading dataset *****")
    # Load data
    processor = (SquadV2Processor()
                 if args.version_2_with_negative else SquadV1Processor())
    train_examples = (processor.get_train_examples(args.data_dir)
                      if args.do_train else None)
    dev_examples = (processor.get_dev_examples(args.data_dir)
                    if args.do_predict else None)

    if is_main_process():
        log("***** Loading features *****")
    # Load cached features
    squad_version = '2.0' if args.version_2_with_negative else '1.1'
    if args.cache_dir is None:
        args.cache_dir = args.data_dir
    # NOTE: positional field {0} (the model name) is passed but not used by
    # either template; the split("/")[1] is kept so behaviour is unchanged.
    cached_train_features_file = args.cache_dir.rstrip(
        '/') + '/' + 'TF2_train-v{4}.json_{1}_{2}_{3}'.format(
            electra_model.split("/")[1], str(args.max_seq_length),
            str(args.doc_stride), str(args.max_query_length), squad_version)
    cached_dev_features_file = args.cache_dir.rstrip(
        '/') + '/' + 'TF2_dev-v{4}.json_{1}_{2}_{3}'.format(
            electra_model.split("/")[1], str(args.max_seq_length),
            str(args.doc_stride), str(args.max_query_length), squad_version)

    try:
        # NOTE(security): pickle.load can execute code embedded in the file;
        # cache_dir must only contain files produced by this script.
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader) if args.do_train else []
        with open(cached_dev_features_file, "rb") as reader:
            dev_features = pickle.load(reader) if args.do_predict else []
    except Exception:
        # Missing or unreadable cache: rebuild the features. `Exception`
        # (not a bare except) so Ctrl-C / SystemExit are not swallowed and
        # turned into a silent, expensive rebuild.
        train_features = (  # TODO: (yy) do on rank 0?
            squad_convert_examples_to_features(
                examples=train_examples,
                tokenizer=tokenizer,
                max_seq_length=args.max_seq_length,
                doc_stride=args.doc_stride,
                max_query_length=args.max_query_length,
                is_training=True,
                return_dataset="",
            ) if args.do_train else [])
        dev_features = (squad_convert_examples_to_features(
            examples=dev_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False,
            return_dataset="",
        ) if args.do_predict else [])
        # Dump Cached features
        if not args.skip_cache and is_main_process():
            if args.do_train:
                log("***** Building Cache Files: {} *****".format(
                    cached_train_features_file))
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)
            if args.do_predict:
                log("***** Building Cache Files: {} *****".format(
                    cached_dev_features_file))
                with open(cached_dev_features_file, "wb") as writer:
                    pickle.dump(dev_features, writer)

    len_train_features = len(train_features)
    total_train_steps = int(
        (len_train_features * EPOCHS / BATCH_SIZE) / get_world_size()) + 1
    train_steps_per_epoch = int(
        (len_train_features / BATCH_SIZE) / get_world_size()) + 1
    len_dev_features = len(dev_features)
    total_dev_steps = int((len_dev_features / EVAL_BATCH_SIZE)) + 1

    train_dataset = get_dataset_from_features(
        train_features, BATCH_SIZE,
        v2=args.version_2_with_negative) if args.do_train else []
    dev_dataset = get_dataset_from_features(
        dev_features,
        EVAL_BATCH_SIZE,
        drop_remainder=False,
        ngpu=1,
        mode="dev",
        v2=args.version_2_with_negative) if args.do_predict else []

    opt = create_optimizer(
        init_lr=args.learning_rate,
        num_train_steps=total_train_steps,
        num_warmup_steps=int(args.warmup_proportion * total_train_steps),
        weight_decay_rate=args.weight_decay_rate,
        layerwise_lr_decay=args.layerwise_lr_decay,
        n_transformer_layers=model.num_hidden_layers)
    if USE_AMP:
        # loss scaling is currently required when using mixed precision
        opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(
            opt, "dynamic")

    # Define loss function
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    loss_class = tf.keras.losses.BinaryCrossentropy(
        from_logits=True, name='binary_crossentropy')
    metric = tf.keras.metrics.SparseCategoricalAccuracy("accuracy")
    model.compile(optimizer=opt, loss=loss, metrics=[metric])
    train_loss_results = []

    if args.do_train and is_main_process():
        log("***** Running training *****")
        log(" Num examples = ", len_train_features)
        log(" Num Epochs = ", args.num_train_epochs)
        log(" Instantaneous batch size per GPU = ", args.train_batch_size)
        log(
            " Total train batch size (w. parallel, distributed & accumulation) = ",
            args.train_batch_size * get_world_size(),
        )
        log(" Total optimization steps =", total_train_steps)

    total_train_time = 0
    latency = []
    for epoch in range(EPOCHS):
        if args.do_train:
            epoch_loss_avg = tf.keras.metrics.Mean()
            epoch_perf_avg = tf.keras.metrics.Mean()
            epoch_start = time.time()

            epoch_iterator = tqdm(train_dataset,
                                  total=train_steps_per_epoch,
                                  desc="Iteration",
                                  mininterval=5,
                                  disable=not is_main_process())
            # `step_idx` (was `iter`) avoids shadowing the builtin.
            for step_idx, inputs in enumerate(epoch_iterator):
                # stop once the global step count exceeds max_steps (if > 0)
                if args.max_steps > 0 and (
                        epoch * train_steps_per_epoch +
                        step_idx) > args.max_steps:
                    break
                iter_start = time.time()
                # Optimize the model; the bool flags the very first step.
                loss_value = train_step(model,
                                        inputs,
                                        loss,
                                        USE_AMP,
                                        opt, (step_idx == 0 and epoch == 0),
                                        v2=args.version_2_with_negative,
                                        loss_class=loss_class,
                                        fp16=USE_AMP)
                epoch_perf_avg.update_state(1. * BATCH_SIZE /
                                            (time.time() - iter_start))
                if step_idx % args.log_freq == 0:
                    if is_main_process():
                        log("\nEpoch: {:03d}, Step:{:6d}, Loss:{:12.8f}, Perf:{:5.0f}, loss_scale:{}, opt_step:{}"
                            .format(epoch, step_idx, loss_value,
                                    epoch_perf_avg.result() * get_world_size(),
                                    opt.loss_scale if config.amp else 1,
                                    int(opt.iterations)))
                    dllogger.log(step=(epoch, step_idx,),
                                 data={
                                     "step_loss": float(loss_value.numpy()),
                                     "train_perf": float(
                                         epoch_perf_avg.result().numpy() *
                                         get_world_size())
                                 })

                # Track progress
                epoch_loss_avg.update_state(loss_value)  # Add current batch loss

            # End epoch
            train_loss_results.append(epoch_loss_avg.result())
            total_train_time += float(time.time() - epoch_start)
            # Summarize and save checkpoint at the end of each epoch
            if is_main_process():
                dllogger.log(step=tuple(),
                             data={
                                 "e2e_train_time": total_train_time,
                                 "training_sequences_per_second": float(
                                     epoch_perf_avg.result().numpy() *
                                     get_world_size()),
                                 "final_loss": float(
                                     epoch_loss_avg.result().numpy())
                             })

            if not args.skip_checkpoint:
                if args.ci:
                    checkpoint_name = "{}/electra_base_qa_v2_{}_epoch_{}_ckpt".format(
                        args.output_dir, args.version_2_with_negative,
                        epoch + 1)
                else:
                    checkpoint_name = "checkpoints/electra_base_qa_v2_{}_epoch_{}_ckpt".format(
                        args.version_2_with_negative, epoch + 1)
                if is_main_process():
                    model.save_weights(checkpoint_name)

        if args.do_predict and (args.evaluate_during_training
                                or epoch == args.num_train_epochs - 1):
            if not args.do_train:
                log("***** Loading checkpoint: {} *****".format(
                    args.init_checkpoint))
                model.load_weights(args.init_checkpoint).expect_partial()

            current_feature_id = 0
            all_results = []
            if is_main_process():
                log("***** Running evaluation *****")
                log(" Num Batches = ", total_dev_steps)
                log(" Batch size = ", args.predict_batch_size)

            raw_infer_start = time.time()
            # Prediction and all reporting that reads `infer_perf_avg` run
            # on the main process, the only place that metric is created.
            if is_main_process():
                infer_perf_avg = tf.keras.metrics.Mean()
                dev_iterator = tqdm(dev_dataset,
                                    total=total_dev_steps,
                                    desc="Iteration",
                                    mininterval=5,
                                    disable=not is_main_process())
                for (input_ids, input_mask, segment_ids, start_positions,
                     end_positions, cls_index, p_mask,
                     is_impossible) in dev_iterator:
                    # training=False is needed only if there are layers with different
                    # behavior during training versus inference (e.g. Dropout).
                    iter_start = time.time()

                    if not args.joint_head:
                        batch_start_logits, batch_end_logits = infer_step(
                            model,
                            input_ids,
                            attention_mask=input_mask,
                            token_type_ids=segment_ids,
                        )[:2]
                        # Synchronize with GPU to compute time
                        _ = batch_start_logits.numpy()
                    else:
                        outputs = infer_step(
                            model,
                            input_ids,
                            attention_mask=input_mask,
                            token_type_ids=segment_ids,
                            cls_index=cls_index,
                            p_mask=p_mask,
                        )
                        # Synchronize with GPU to compute time
                        _ = outputs[0].numpy()

                    infer_time = (time.time() - iter_start)
                    infer_perf_avg.update_state(1. * EVAL_BATCH_SIZE /
                                                infer_time)
                    latency.append(infer_time)

                    # Pair each row of the batch with the next dev feature;
                    # relies on dev_dataset preserving dev_features order.
                    for iter_ in range(input_ids.shape[0]):
                        dev_feature = dev_features[current_feature_id]
                        current_feature_id += 1
                        unique_id = int(dev_feature.unique_id)
                        if not args.joint_head:
                            start_logits = batch_start_logits[iter_].numpy(
                            ).tolist()
                            end_logits = batch_end_logits[iter_].numpy(
                            ).tolist()
                            all_results.append(
                                RawResult(unique_id=unique_id,
                                          start_logits=start_logits,
                                          end_logits=end_logits))
                        else:
                            per_example = [
                                out[iter_].numpy().tolist() for out in outputs
                            ]
                            start_logits = per_example[0]
                            start_top_index = per_example[1]
                            end_logits = per_example[2]
                            end_top_index = per_example[3]
                            cls_logits = per_example[4]
                            result = SquadResult(
                                unique_id,
                                start_logits,
                                end_logits,
                                start_top_index=start_top_index,
                                end_top_index=end_top_index,
                                cls_logits=cls_logits,
                            )
                            all_results.append(result)

                # Compute and save predictions
                answers, nbest_answers = get_answers(dev_examples,
                                                     dev_features,
                                                     all_results, args)

                output_prediction_file = os.path.join(args.output_dir,
                                                      "predictions.json")
                output_nbest_file = os.path.join(args.output_dir,
                                                 "nbest_predictions.json")
                e2e_infer_time = time.time() - raw_infer_start
                with open(output_prediction_file, "w") as f:
                    f.write(json.dumps(answers, indent=4) + "\n")
                with open(output_nbest_file, "w") as f:
                    f.write(json.dumps(nbest_answers, indent=4) + "\n")

                if args.do_eval:
                    if args.version_2_with_negative:
                        dev_file = "dev-v2.0.json"
                    else:
                        dev_file = "dev-v1.1.json"

                    eval_out = subprocess.check_output([
                        sys.executable, args.eval_script,
                        args.data_dir + "/" + dev_file, output_prediction_file
                    ])
                    log(eval_out.decode('UTF-8'))
                    # The score values are cut out of the script's output by
                    # splitting on ':' and ',' / '}'.
                    scores = str(eval_out).strip()
                    exact_match = float(scores.split(":")[1].split(",")[0])
                    if args.version_2_with_negative:
                        f1 = float(scores.split(":")[2].split(",")[0])
                    else:
                        f1 = float(scores.split(":")[2].split("}")[0])

                    log("Epoch: {:03d} Results: {}".format(
                        epoch, eval_out.decode('UTF-8')))
                    log("**EVAL SUMMARY** - Epoch: {:03d}, EM: {:6.3f}, F1: {:6.3f}, Infer_Perf: {:4.0f} seq/s"
                        .format(epoch, exact_match, f1,
                                infer_perf_avg.result()))

                # The two slowest samples are dropped before summarising.
                # The helper returns 0.0 for an empty prefix, so short runs
                # no longer hit a ZeroDivisionError here.
                latency_all = sorted(latency)[:-2]
                log("**LATENCY SUMMARY** - Epoch: {:03d}, Ave: {:6.3f} ms, 90%: {:6.3f} ms, 95%: {:6.3f} ms, 99%: {:6.3f} ms"
                    .format(
                        epoch,
                        _mean_latency_ms(latency_all, 1.0),
                        _mean_latency_ms(latency_all, 0.9),
                        _mean_latency_ms(latency_all, 0.95),
                        _mean_latency_ms(latency_all, 0.99),
                    ))
                dllogger.log(step=tuple(),
                             data={
                                 "inference_sequences_per_second": float(
                                     infer_perf_avg.result().numpy()),
                                 "e2e_inference_time": e2e_infer_time
                             })

    if is_main_process() and args.do_train and args.do_eval:
        log("**RESULTS SUMMARY** - EM: {:6.3f}, F1: {:6.3f}, Train_Time: {:4.0f} s, Train_Perf: {:4.0f} seq/s, Infer_Perf: {:4.0f} seq/s"
            .format(exact_match, f1, total_train_time,
                    epoch_perf_avg.result() * get_world_size(),
                    infer_perf_avg.result()))
        dllogger.log(step=tuple(),
                     data={
                         "exact_match": exact_match,
                         "F1": f1
                     })
def main(args):
    """Train or evaluate a ``Vision_TransformerSuper`` supernet.

    The function builds the train/validation datasets and loaders, the
    model, optimizer, LR scheduler and loss, optionally restores a
    checkpoint, and then either runs a single evaluation (``args.eval``)
    or the epoch loop with per-epoch checkpointing, evaluation and logging.

    Args:
        args: Parsed command-line namespace. It is mutated in place:
            ``args.nb_classes`` is set from the training dataset,
            ``args.lr`` is replaced by the linearly scaled learning rate,
            and ``args.start_epoch`` is advanced when resuming training.

    Returns:
        None.
    """
    utils.init_distributed_mode(args)
    update_config_from_file(args.cfg)

    print(args)
    args_text = yaml.safe_dump(args.__dict__, default_flow_style=False)

    device = torch.device(args.device)

    # Fix the seed for reproducibility; offset by rank so every process
    # draws a different random stream.
    seed = args.seed + utils.get_rank()
    torch.manual_seed(seed)
    np.random.seed(seed)
    cudnn.benchmark = True

    dataset_train, args.nb_classes = build_dataset(is_train=True, args=args)
    dataset_val, _ = build_dataset(is_train=False, args=args)

    if args.distributed:
        num_tasks = utils.get_world_size()
        global_rank = utils.get_rank()
        if args.repeated_aug:
            sampler_train = RASampler(
                dataset_train, num_replicas=num_tasks, rank=global_rank,
                shuffle=True)
        else:
            sampler_train = torch.utils.data.DistributedSampler(
                dataset_train, num_replicas=num_tasks, rank=global_rank,
                shuffle=True)
        if args.dist_eval:
            if len(dataset_val) % num_tasks != 0:
                print(
                    'Warning: Enabling distributed evaluation with an eval dataset not divisible by process number. '
                    'This will slightly alter validation results as extra duplicate entries are added to achieve '
                    'equal num of samples per-process.')
            sampler_val = torch.utils.data.DistributedSampler(
                dataset_val, num_replicas=num_tasks, rank=global_rank,
                shuffle=False)
        else:
            sampler_val = torch.utils.data.SequentialSampler(dataset_val)
    else:
        sampler_val = torch.utils.data.SequentialSampler(dataset_val)
        sampler_train = torch.utils.data.RandomSampler(dataset_train)

    data_loader_train = torch.utils.data.DataLoader(
        dataset_train,
        sampler=sampler_train,
        batch_size=args.batch_size,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=True,
    )

    # Validation runs without gradients, so a doubled batch size is used.
    data_loader_val = torch.utils.data.DataLoader(
        dataset_val,
        batch_size=int(2 * args.batch_size),
        sampler=sampler_val,
        num_workers=args.num_workers,
        pin_memory=args.pin_mem,
        drop_last=False)

    mixup_fn = None
    mixup_active = (args.mixup > 0 or args.cutmix > 0.
                    or args.cutmix_minmax is not None)
    if mixup_active:
        mixup_fn = Mixup(mixup_alpha=args.mixup,
                         cutmix_alpha=args.cutmix,
                         cutmix_minmax=args.cutmix_minmax,
                         prob=args.mixup_prob,
                         switch_prob=args.mixup_switch_prob,
                         mode=args.mixup_mode,
                         label_smoothing=args.smoothing,
                         num_classes=args.nb_classes)

    print("Creating SuperVisionTransformer")
    print(cfg)
    model = Vision_TransformerSuper(
        img_size=args.input_size,
        patch_size=args.patch_size,
        embed_dim=cfg.SUPERNET.EMBED_DIM,
        depth=cfg.SUPERNET.DEPTH,
        num_heads=cfg.SUPERNET.NUM_HEADS,
        mlp_ratio=cfg.SUPERNET.MLP_RATIO,
        qkv_bias=True,
        drop_rate=args.drop,
        drop_path_rate=args.drop_path,
        gp=args.gp,
        num_classes=args.nb_classes,
        max_relative_position=args.max_relative_position,
        relative_position=args.relative_position,
        change_qkv=args.change_qkv,
        abs_pos=not args.no_abs_pos)

    # Search space handed to the training/evaluation helpers.
    choices = {
        'num_heads': cfg.SEARCH_SPACE.NUM_HEADS,
        'mlp_ratio': cfg.SEARCH_SPACE.MLP_RATIO,
        'embed_dim': cfg.SEARCH_SPACE.EMBED_DIM,
        'depth': cfg.SEARCH_SPACE.DEPTH
    }

    model.to(device)
    if args.teacher_model:
        teacher_model = create_model(
            args.teacher_model,
            pretrained=True,
            num_classes=args.nb_classes,
        )
        teacher_model.to(device)
        teacher_loss = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        teacher_model = None
        teacher_loss = None

    # No EMA copy of the model is created by this function.
    model_ema = None

    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.gpu], find_unused_parameters=True)
        model_without_ddp = model.module

    n_parameters = sum(p.numel() for p in model.parameters()
                       if p.requires_grad)
    print('number of params:', n_parameters)

    # Linear LR scaling with the global batch size (reference batch: 512).
    linear_scaled_lr = (args.lr * args.batch_size * utils.get_world_size()
                        / 512.0)
    args.lr = linear_scaled_lr
    optimizer = create_optimizer(args, model_without_ddp)
    loss_scaler = NativeScaler()
    lr_scheduler, _ = create_scheduler(args, optimizer)

    # The loss must match the targets produced by ``mixup_fn``: whenever
    # mixup OR cutmix is active the targets are soft, so the soft-target
    # loss is required (previously only ``args.mixup > 0`` was checked,
    # which mismatched the loss for cutmix-only runs).
    if mixup_active:
        # smoothing is handled with mixup label transform
        criterion = SoftTargetCrossEntropy()
    elif args.smoothing:
        criterion = LabelSmoothingCrossEntropy(smoothing=args.smoothing)
    else:
        criterion = torch.nn.CrossEntropyLoss()

    output_dir = Path(args.output_dir)
    # ``exist_ok=True`` avoids a FileExistsError when several processes
    # race to create the directory.
    output_dir.mkdir(parents=True, exist_ok=True)

    # save config for later experiments
    with open(output_dir / "config.yaml", 'w') as f:
        f.write(args_text)

    if args.resume:
        if args.resume.startswith('https'):
            checkpoint = torch.hub.load_state_dict_from_url(
                args.resume, map_location='cpu', check_hash=True)
        else:
            checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        # Training state is only restored when continuing training and the
        # checkpoint actually carries it.
        if (not args.eval and 'optimizer' in checkpoint
                and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint):
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
            args.start_epoch = checkpoint['epoch'] + 1
            if 'scaler' in checkpoint:
                loss_scaler.load_state_dict(checkpoint['scaler'])
            # ``model_ema`` is never built above and checkpoints written by
            # this function carry no 'model_ema' entry, so only restore EMA
            # weights when both sides exist.
            if (args.model_ema and model_ema is not None
                    and 'model_ema' in checkpoint):
                utils._load_checkpoint_for_ema(model_ema,
                                               checkpoint['model_ema'])

    retrain_config = None
    if args.mode == 'retrain' and "RETRAIN" in cfg:
        retrain_config = {
            'layer_num': cfg.RETRAIN.DEPTH,
            'embed_dim': [cfg.RETRAIN.EMBED_DIM] * cfg.RETRAIN.DEPTH,
            'num_heads': cfg.RETRAIN.NUM_HEADS,
            'mlp_ratio': cfg.RETRAIN.MLP_RATIO
        }

    if args.eval:
        print(retrain_config)
        test_stats = evaluate(data_loader_val,
                              model,
                              device,
                              mode=args.mode,
                              retrain_config=retrain_config)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        return

    print("Start training")
    start_time = time.time()
    max_accuracy = 0.0

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            # Makes the distributed sampler reshuffle for this epoch.
            data_loader_train.sampler.set_epoch(epoch)

        train_stats = train_one_epoch(
            model,
            criterion,
            data_loader_train,
            optimizer,
            device,
            epoch,
            loss_scaler,
            args.clip_grad,
            model_ema,
            mixup_fn,
            amp=args.amp,
            teacher_model=teacher_model,
            teach_loss=teacher_loss,
            choices=choices,
            mode=args.mode,
            retrain_config=retrain_config,
        )

        lr_scheduler.step(epoch)
        if args.output_dir:
            checkpoint_paths = [output_dir / 'checkpoint.pth']
            for checkpoint_path in checkpoint_paths:
                utils.save_on_master(
                    {
                        'model': model_without_ddp.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'lr_scheduler': lr_scheduler.state_dict(),
                        'epoch': epoch,
                        'scaler': loss_scaler.state_dict(),
                        'args': args,
                    }, checkpoint_path)

        test_stats = evaluate(data_loader_val,
                              model,
                              device,
                              amp=args.amp,
                              choices=choices,
                              mode=args.mode,
                              retrain_config=retrain_config)
        print(
            f"Accuracy of the network on the {len(dataset_val)} test images: {test_stats['acc1']:.1f}%"
        )
        max_accuracy = max(max_accuracy, test_stats["acc1"])
        print(f'Max accuracy: {max_accuracy:.2f}%')

        log_stats = {
            **{f'train_{k}': v for k, v in train_stats.items()},
            **{f'test_{k}': v for k, v in test_stats.items()},
            'epoch': epoch,
            'n_parameters': n_parameters
        }

        # One JSON line per epoch, written by the main process only.
        if args.output_dir and utils.is_main_process():
            with (output_dir / "log.txt").open("a") as f:
                f.write(json.dumps(log_stats) + "\n")

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def main():
    """Run BERT pretraining until the step budget is reached or a timeout is signalled.

    Parses arguments, seeds the RNGs (offset by ``local_rank``), sets up the
    device and logging, builds the model/optimizer, and then loops over the
    training shard files. While one shard is being trained on, the next one
    is loaded in a background process. Checkpoints are written every
    ``args.num_steps_per_checkpoint`` optimizer steps and when training stops.

    Returns:
        tuple: ``(args, final_loss, train_time_raw, global_step)`` once
        ``global_step >= args.steps_this_run`` or the module-level
        ``timeout_sent`` flag is set. If ``args.do_train`` is false the
        function falls through and returns ``None``.
    """
    # NOTE(review): ``timeout_sent`` is only read here; presumably it is set
    # by a signal handler elsewhere in the file - confirm.
    global timeout_sent

    args = parse_arguments()

    random.seed(args.seed + args.local_rank)
    np.random.seed(args.seed + args.local_rank)
    torch.manual_seed(args.seed + args.local_rank)
    torch.cuda.manual_seed(args.seed + args.local_rank)
    worker_init = WorkerInitObj(args.seed + args.local_rank)

    device, args = setup_training(args)
    dllogger.log(step="PARAMETER", data={"Config": [str(args)]})

    # Prepare optimizer
    model, optimizer, lr_scheduler, checkpoint, global_step, criterion = prepare_model_and_optimizer(args, device)

    if args.disable_weight_tying:
        # Sanity Check that new param is in optimizer.
        # The model is only wrapped (and thus only has ``.module``) on some
        # code paths, so unwrap defensively exactly as the checkpoint-saving
        # code below does.
        unwrapped_model = model.module if hasattr(model, 'module') else model
        decoder_weight_id = id(unwrapped_model.cls.predictions.decoder.weight)
        in_optimizer = decoder_weight_id in [id(g) for g in optimizer.param_groups[0]['params']]
        print("SANITY CHECK OPTIMIZER: ", in_optimizer)
        assert in_optimizer

    print(f"SAVING EVERY {args.num_steps_per_checkpoint} STEPS!")

    if is_main_process():
        dllogger.log(step="PARAMETER", data={"SEED": args.seed})

    raw_train_start = None
    if args.do_train:
        if is_main_process():
            dllogger.log(step="PARAMETER", data={"train_start": True})
            dllogger.log(step="PARAMETER", data={"batch_size_per_gpu": args.train_batch_size})
            dllogger.log(step="PARAMETER", data={"learning_rate": args.learning_rate})

        model.train()
        most_recent_ckpts_paths = []
        average_loss = 0.0  # averaged loss every args.log_freq steps
        epoch = 0
        training_steps = 0

        # Single background worker used to prefetch the next shard's loader.
        pool = ProcessPoolExecutor(1)

        # Note: We loop infinitely over epochs, termination is handled via iteration count
        while True:
            restored_data_loader = None
            if not args.resume_from_checkpoint or epoch > 0 or (args.phase2 and global_step < 1) or args.init_checkpoint:
                files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if
                         os.path.isfile(os.path.join(args.input_dir, f)) and ('training' in f or 'train' in f)]
                files.sort()
                num_files = len(files)
                # Deterministic per-epoch shuffle of the shard order.
                random.Random(args.seed + epoch).shuffle(files)
                f_start_id = 0
            else:
                # Resume: the checkpoint stores [next_file_index] + file list.
                f_start_id = checkpoint['files'][0]
                files = checkpoint['files'][1:]
                args.resume_from_checkpoint = False
                num_files = len(files)
                # may not exist in all checkpoints
                epoch = checkpoint.get('epoch', 0)
                restored_data_loader = checkpoint.get('data_loader', None)

            shared_file_list = {}

            # Pick this rank's shard for the starting file index.
            if torch.distributed.is_initialized() and get_world_size() > num_files:
                remainder = get_world_size() % num_files
                data_file = files[(f_start_id*get_world_size()+get_rank() + remainder*f_start_id)%num_files]
            else:
                data_file = files[(f_start_id*get_world_size()+get_rank())%num_files]

            if restored_data_loader is None:
                train_data = pretraining_dataset(data_file, args.max_predictions_per_seq)
                train_sampler = RandomSampler(train_data)
                train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                              batch_size=args.train_batch_size * args.n_gpu,
                                              num_workers=4, worker_init_fn=worker_init,
                                              pin_memory=True)
            else:
                train_dataloader = restored_data_loader
                restored_data_loader = None

            overflow_buf = None
            if args.allreduce_post_accumulation:
                overflow_buf = torch.cuda.IntTensor([0])

            for f_id in range(f_start_id + 1, len(files)):
                # Shard to prefetch while the current loader is consumed.
                if get_world_size() > num_files:
                    data_file = files[(f_id*get_world_size()+get_rank() + remainder*f_id)%num_files]
                else:
                    data_file = files[(f_id*get_world_size()+get_rank())%num_files]

                dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init)

                train_iter = tqdm(train_dataloader, desc="Iteration", disable=args.disable_progress_bar) if is_main_process() else train_dataloader

                if raw_train_start is None:
                    raw_train_start = time.time()
                for step, batch in enumerate(train_iter):
                    training_steps += 1
                    batch = [t.to(device) for t in batch]
                    input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch
                    prediction_scores, seq_relationship_score = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
                    loss, mlm_loss, ns_loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels)
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu.
                        mlm_loss = mlm_loss.detach().mean()
                        ns_loss = ns_loss.detach().mean()

                    divisor = args.gradient_accumulation_steps
                    if args.gradient_accumulation_steps > 1:
                        if not args.allreduce_post_accumulation:
                            # this division was merged into predivision
                            loss = loss / args.gradient_accumulation_steps
                            mlm_loss = mlm_loss.detach() / args.gradient_accumulation_steps
                            ns_loss = ns_loss.detach() / args.gradient_accumulation_steps
                            divisor = 1.0
                    if args.fp16:
                        with amp.scale_loss(loss, optimizer, delay_overflow_check=args.allreduce_post_accumulation) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()
                    average_loss += loss.item()

                    # One optimizer step per ``gradient_accumulation_steps`` micro-batches.
                    if training_steps % args.gradient_accumulation_steps == 0:
                        lr_scheduler.step()  # learning rate warmup
                        global_step = take_optimizer_step(args, optimizer, model, overflow_buf, global_step)

                    if global_step >= args.steps_this_run or timeout_sent:
                        # Final report: average the loss accumulated since the
                        # last periodic log and reduce it across ranks.
                        train_time_raw = time.time() - raw_train_start
                        last_num_steps = int(training_steps / args.gradient_accumulation_steps) % args.log_freq
                        last_num_steps = args.log_freq if last_num_steps == 0 else last_num_steps
                        average_loss = torch.tensor(average_loss, dtype=torch.float32).cuda()
                        average_loss = average_loss / (last_num_steps * divisor)
                        if (torch.distributed.is_initialized()):
                            average_loss /= get_world_size()
                            torch.distributed.all_reduce(average_loss)
                        final_loss = average_loss.item()
                        if is_main_process():
                            dllogger.log(step=(epoch, global_step, ), data={"final_loss": final_loss})
                    elif training_steps % (args.log_freq * args.gradient_accumulation_steps) == 0:
                        if is_main_process():
                            dllogger.log(step=(epoch, global_step, ), data={"average_loss": average_loss / (args.log_freq * divisor),
                                                                            "step_loss": loss.item() * args.gradient_accumulation_steps / divisor,
                                                                            "learning_rate": optimizer.param_groups[0]['lr'],
                                                                            "mlm_loss" : mlm_loss.item(),
                                                                            "ns_loss" : ns_loss.item()})
                        average_loss = 0

                    if global_step >= args.steps_this_run or training_steps % (
                            args.num_steps_per_checkpoint * args.gradient_accumulation_steps) == 0 or timeout_sent:
                        if is_main_process() and not args.skip_checkpoint:
                            # Save a trained model
                            dllogger.log(step="PARAMETER", data={"checkpoint_step": global_step})
                            model_to_save = model.module if hasattr(model,
                                                                    'module') else model  # Only save the model it-self
                            if args.resume_step < 0 or not args.phase2:
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step))
                            else:
                                # Phase-2 checkpoints continue the phase-1 step numbering.
                                output_save_file = os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step + args.phase1_end_step))
                            if args.do_train:
                                torch.save({'model': model_to_save.state_dict(),
                                            'optimizer': optimizer.state_dict(),
                                            'master params': list(amp.master_params(optimizer)),
                                            'files': [f_id] + files,
                                            'epoch': epoch,
                                            'data_loader': None if global_step >= args.max_steps else train_dataloader},
                                           output_save_file)

                                most_recent_ckpts_paths.append(output_save_file)

                        # Exiting the training due to hitting max steps, or being sent a
                        # timeout from the cluster scheduler
                        if global_step >= args.steps_this_run or timeout_sent:
                            del train_dataloader
                            return args, final_loss, train_time_raw, global_step

                del train_dataloader
                # Make sure pool has finished and switch train_dataloader
                # NOTE: Will block until complete
                train_dataloader, data_file = dataset_future.result(timeout=None)

            epoch += 1
def prepare_model_and_optimizer(args, device):
    """Build the BERT pretraining model, optimizer, LR scheduler and loss.

    Optionally restores model/optimizer state from a checkpoint, initialises
    AMP for fp16 runs, and wraps the model for multi-GPU execution.

    Args:
        args: Parsed argument namespace. ``args.resume_step`` is updated in
            place when it is ``-1`` and the latest checkpoint in
            ``args.output_dir`` is auto-detected.
        device: Device the model is moved to.

    Returns:
        tuple: ``(model, optimizer, lr_scheduler, checkpoint, global_step,
        criterion)``. ``checkpoint`` is ``None`` unless
        ``args.resume_from_checkpoint`` is set.

    Raises:
        ValueError: If resuming with ``args.resume_step == -1`` and no
            ``.pt`` checkpoint exists in ``args.output_dir``.
    """
    # Prepare model
    config = modeling.BertConfig.from_json_file(args.config_file)

    # Padding for divisibility by 8
    if config.vocab_size % 8 != 0:
        config.vocab_size += 8 - (config.vocab_size % 8)

    modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training
    model = modeling.BertForPreTraining(config)

    if args.disable_weight_tying:
        # Give the decoder its own copy of the embedding matrix so the two
        # are no longer the same Parameter object.
        print ("WARNING!!!!!!! Disabling weight tying for this run")
        print ("BEFORE ", model.cls.predictions.decoder.weight is model.bert.embeddings.word_embeddings.weight)
        model.cls.predictions.decoder.weight = torch.nn.Parameter(model.cls.predictions.decoder.weight.clone().detach())
        print ("AFTER ", model.cls.predictions.decoder.weight is model.bert.embeddings.word_embeddings.weight)
        assert model.cls.predictions.decoder.weight is not model.bert.embeddings.word_embeddings.weight

    checkpoint = None
    if not args.resume_from_checkpoint:
        global_step = 0
    else:
        if args.resume_step == -1 and not args.init_checkpoint:
            # Auto-detect the newest checkpoint from names like ``ckpt_<step>.pt``.
            model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")]
            if not model_names:
                raise ValueError(
                    "No '.pt' checkpoint found in output directory ({}) to resume from.".format(
                        args.output_dir))
            args.resume_step = max(int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names)

        global_step = args.resume_step if not args.init_checkpoint else 0

        if not args.init_checkpoint:
            checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu")
        else:
            checkpoint = torch.load(args.init_checkpoint, map_location="cpu")

        model.load_state_dict(checkpoint['model'], strict=False)

        # Phase-2 step counting restarts after the phase-1 steps.
        if args.phase2 and not args.init_checkpoint:
            global_step -= args.phase1_end_step
        if is_main_process():
            print("resume step from ", args.resume_step)

    model.to(device)
    param_optimizer = list(model.named_parameters())
    # Parameters whose name contains any of these get no weight decay.
    no_decay = ['bias', 'gamma', 'beta', 'LayerNorm']

    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]

    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=args.learning_rate)
    lr_scheduler = PolyWarmUpScheduler(optimizer,
                                       warmup=args.warmup_proportion,
                                       total_steps=args.max_steps,
                                       degree=1)
    if args.fp16:
        # ``loss_scale == 0`` selects dynamic loss scaling.
        if args.loss_scale == 0:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic", cast_model_outputs=torch.float16)
        else:
            model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale, cast_model_outputs=torch.float16)
        amp._amp_state.loss_scalers[0]._loss_scale = args.init_loss_scale

    model.checkpoint_activations(args.checkpoint_activations)

    if args.resume_from_checkpoint:
        if args.phase2 or args.init_checkpoint:
            # Override hyperparameters from previous checkpoint
            for param_state in checkpoint['optimizer']['state'].values():
                param_state['step'] = global_step
            for param_group in checkpoint['optimizer']['param_groups']:
                param_group['step'] = global_step
                param_group['t_total'] = args.max_steps
                param_group['warmup'] = args.warmup_proportion
                param_group['lr'] = args.learning_rate
        optimizer.load_state_dict(checkpoint['optimizer'])  # , strict=False)

        # Restore AMP master parameters
        if args.fp16:
            optimizer._lazy_init_maybe_master_weights()
            optimizer._amp_stash.lazy_init_called = True
            optimizer.load_state_dict(checkpoint['optimizer'])
            for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']):
                param.data.copy_(saved_param.data)

    if args.local_rank != -1:
        if not args.allreduce_post_accumulation:
            model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size())
        else:
            # The model is left unwrapped on this path.
            flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) )
    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    criterion = BertPretrainingCriterion(config.vocab_size)

    if args.disable_weight_tying:
        # Sanity Check that new param is in optimizer.
        # The model is only wrapped (and thus only has ``.module``) on some
        # of the branches above, so unwrap defensively.
        unwrapped_model = model.module if hasattr(model, 'module') else model
        decoder_weight_id = id(unwrapped_model.cls.predictions.decoder.weight)
        in_optimizer = decoder_weight_id in [id(g) for g in optimizer.param_groups[0]['params']]
        print ("SANITY CHECK OPTIMIZER: ", in_optimizer)
        assert in_optimizer

    return model, optimizer, lr_scheduler, checkpoint, global_step, criterion
del train_dataloader # thread.join() return args, final_loss, train_time_raw, global_step del train_dataloader # thread.join() # Make sure pool has finished and switch train_dataloader # NOTE: Will block until complete train_dataloader, data_file = dataset_future.result(timeout=None) epoch += 1 if __name__ == "__main__": now = time.time() args, final_loss, train_time_raw, global_step = main() gpu_count = args.n_gpu global_step += args.phase1_end_step if (args.phase2 and args.resume_step > 0) else 0 if args.resume_step == -1: args.resume_step = 0 if torch.distributed.is_initialized(): gpu_count = get_world_size() if is_main_process(): e2e_time = time.time() - now training_perf = args.train_batch_size * args.gradient_accumulation_steps * gpu_count\ * (global_step - args.resume_step + skipped_steps) / train_time_raw dllogger.log(step=tuple(), data={"e2e_train_time": e2e_time, "training_sequences_per_second": training_perf, "final_loss": final_loss, "raw_train_time": train_time_raw }) dllogger.flush()