def main(): global best_prec1, args args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() if args.fp16: assert torch.backends.cudnn.enabled, "fp16 requires cudnn backend to be enabled." if args.static_loss_scale != 1.0: if not args.fp16: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) # create model if args.pretrained: if args.local_rank == 0: print("=> using pre-trained model '{}'".format(args.arch)) if args.arch.startswith('aognet'): cfg.merge_from_file(os.path.join(args.save_dir, 'config.yaml')) model = aognet_m() if args.arch == 'aognet_m' else aognet_s() checkpoint = torch.load( os.path.join(args.save_dir, 'model_best.pth.tar')) # model.load_state_dict(checkpoint['state_dict']) elif args.arch.startswith('resnet'): model = resnets.__dict__[args.arch](pretrained=True) elif args.arch.startswith('mobilenet'): model = mobilenets.__dict__[args.arch](pretrained=True) else: raise NotImplementedError("Unkown network arch.") else: if args.local_rank == 0: print("=> creating {}".format(args.arch)) # update args cfg.merge_from_file(args.cfg) args.batch_size = cfg.batch_size args.lr = cfg.lr args.momentum = cfg.momentum args.weight_decay = cfg.wd args.nesterov = cfg.nesterov args.epochs = cfg.num_epoch if args.arch.startswith('aognet'): model = aognet_m() if args.arch == 'aognet_m' else aognet_s() elif args.arch.startswith('resnet'): model = resnets.__dict__[args.arch]( zero_init_residual=cfg.norm_zero_gamma_init, num_classes=cfg.num_classes, replace_stride_with_dilation=cfg.resnet. replace_stride_with_dilation, dataset=cfg.dataset, base_inplanes=cfg.resnet.base_inplanes, imagenet_head7x7=cfg.stem.imagenet_head7x7, stem_kernel_size=cfg.stem.stem_kernel_size, stem_stride=cfg.stem.stem_stride, norm_name=cfg.norm_name, norm_groups=cfg.norm_groups, norm_k=cfg.norm_k, norm_attention_mode=cfg.norm_attention_mode, norm_all_mix=cfg.norm_all_mix, extra_norm_ac=cfg.resnet.extra_norm_ac, replace_stride_with_avgpool=cfg.resnet. 
replace_stride_with_avgpool) elif args.arch.startswith('MobileNetV3'): model = mobilenetsv3.__dict__[args.arch]( norm_name=cfg.norm_name, norm_groups=cfg.norm_groups, norm_k=cfg.norm_k, norm_attention_mode=cfg.norm_attention_mode, rm_se=cfg.mobilenet.rm_se, use_mn_in_se=cfg.mobilenet.use_mn_in_se) elif args.arch.startswith('mobilenet'): model = mobilenets.__dict__[args.arch]( norm_name=cfg.norm_name, norm_groups=cfg.norm_groups, norm_k=cfg.norm_k, norm_attention_mode=cfg.norm_attention_mode) elif args.arch.startswith('densenet'): model = densenets.__dict__[args.arch]( num_classes=cfg.num_classes, imagenet_head7x7=cfg.stem.imagenet_head7x7, norm_name=cfg.norm_name, norm_groups=cfg.norm_groups, norm_k=cfg.norm_k, norm_attention_mode=cfg.norm_attention_mode) else: raise NotImplementedError("Unkown network arch.") if args.local_rank == 0: if cfg.dataset.startswith('cifar'): H, W = 32, 32 elif cfg.dataset.startswith('imagenet'): H, W = 224, 224 else: raise NotImplementedError("Unknown dataset") flops, params = thop_profile(copy.deepcopy(model), input_size=(1, 3, H, W)) print('=> FLOPs: {:.6f}G, Params: {:.6f}M'.format( flops / 1e9, params / 1e6)) print('=> Params (double-check): %.6fM' % (sum(p.numel() for p in model.parameters()) / 1e6)) if args.sync_bn: import apex if args.local_rank == 0: print("using apex synced BN") model = apex.parallel.convert_syncbn_model(model) model = model.cuda() if args.fp16: model = FP16Model(model) if args.distributed: # By default, apex.parallel.DistributedDataParallel overlaps communication with # computation in the backward pass. # model = DDP(model) # delay_allreduce delays all communication to the end of the backward pass. model = DDP(model, delay_allreduce=True) if args.pretrained: model.load_state_dict(checkpoint['state_dict']) # Scale learning rate based on global batch size args.lr = args.lr * float( args.batch_size * args.world_size) / cfg.lr_scale_factor #TODO: control the maximum? if args.remove_norm_weight_decay: if args.local_rank == 0: print("=> ! Weight decay NOT applied to FeatNorm parameters ") norm_params = set() #TODO: need to check this via experiments rest_params = set() for m in model.modules(): if isinstance(m, (nn.BatchNorm2d, nn.GroupNorm, MixtureBatchNorm2d, MixtureGroupNorm)): for param in m.parameters(False): norm_params.add(param) else: for param in m.parameters(False): rest_params.add(param) optimizer = torch.optim.SGD([{ 'params': list(norm_params), 'weight_decay': 0.0 }, { 'params': list(rest_params) }], args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) else: if args.local_rank == 0: print("=> ! 
Weight decay applied to FeatNorm parameters ") optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay, nesterov=args.nesterov) if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale) # define loss function (criterion) and optimizer criterion_train = nn.CrossEntropyLoss().cuda() if cfg.dataaug.labelsmoothing_rate == 0.0 \ else LabelSmoothing(cfg.dataaug.labelsmoothing_rate).cuda() criterion_val = nn.CrossEntropyLoss().cuda() # Optionally resume from a checkpoint if args.resume: # Use a local scope to avoid dangling references def resume(): if os.path.isfile(args.resume): if args.local_rank == 0: print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) if args.local_rank == 0: print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: if args.local_rank == 0: print("=> no checkpoint found at '{}'".format(args.resume)) resume() # Data loading code lr_milestones = None if cfg.dataset == "cifar10": train_transform = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip() ]) train_dataset = datasets.CIFAR10('./datasets', train=True, download=False, transform=train_transform) val_dataset = datasets.CIFAR10('./datasets', train=False, download=False) lr_milestones = cfg.lr_milestones elif cfg.dataset == "cifar100": train_transform = transforms.Compose([ transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip() ]) train_dataset = datasets.CIFAR100('./datasets', train=True, download=False, transform=train_transform) val_dataset = datasets.CIFAR100('./datasets', train=False, download=False) lr_milestones = cfg.lr_milestones elif cfg.dataset == "imagenet": traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') crop_size = cfg.crop_size # 224 val_size = cfg.crop_size + 32 # 256 train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop( crop_size, interpolation=cfg.crop_interpolation), transforms.RandomHorizontalFlip(), # transforms.ToTensor(), Too slow # normalize, ])) val_dataset = datasets.ImageFolder( valdir, transforms.Compose([ transforms.Resize(val_size, interpolation=cfg.crop_interpolation), transforms.CenterCrop(crop_size), ])) train_sampler = None val_sampler = None if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, collate_fn=fast_collate) if args.evaluate: validate(val_loader, model, criterion_val) return scheduler = CosineAnnealingLR( optimizer.optimizer if args.fp16 else optimizer, args.epochs, len(train_loader), eta_min=cfg.cosine_lr_min, warmup=cfg.warmup_epochs) if cfg.use_cosine_lr else None for epoch in range(args.start_epoch, args.epochs): if 
args.distributed: train_sampler.set_epoch(epoch) # train for one epoch train(train_loader, model, criterion_train, optimizer, epoch, scheduler, lr_milestones, cfg.warmup_epochs, cfg.dataaug.mixup_rate, cfg.dataaug.labelsmoothing_rate) if args.prof: break # evaluate on validation set prec1 = validate(val_loader, model, criterion_val) # remember best prec@1 and save checkpoint if args.local_rank == 0: is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, args.save_dir)
def main():
    global best_prec1, args, best_prec5

    args.gpu = 0
    args.world_size = 1

    if args.distributed:
        args.gpu = args.local_rank % torch.cuda.device_count()
        torch.cuda.set_device(args.gpu)
        torch.distributed.init_process_group(backend='nccl', init_method='env://')
        args.world_size = torch.distributed.get_world_size()

    args.total_batch_size = args.world_size * args.batch_size

    if args.fp16:
        assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled."
    if args.static_loss_scale != 1.0:
        if not args.fp16:
            print("Warning: if --fp16 is not used, static_loss_scale will be ignored.")

    # create model
    if args.pretrained:
        print("=> using pre-trained model '{}'".format(args.arch))
        model = models.__dict__[args.arch](pretrained=True)
    else:
        print("=> creating model '{}'".format(args.arch))
        model = models.__dict__[args.arch]()

    teacher = None
    if args.distillation:
        teacher = models.resnet50(pretrained=True).cuda()
        teacher.eval()

    model = model.cuda()

    if args.distributed:
        # Delaying the all-reduce turns off bucketing in DDP; for low-latency runs this can improve perf.
        # Older versions of APEX use shared_param; newer ones use delay_allreduce.
        model = DDP(model, delay_allreduce=True)
        if args.distillation:
            teacher = DDP(teacher, delay_allreduce=True)

    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    if args.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
        if args.distillation:
            teacher = amp.initialize(teacher, opt_level="O1")
            # NOTE: teacher_2 (a second teacher model) is assumed to be defined elsewhere;
            # it is not created in this function before being passed to amp.initialize.
            teacher_2 = amp.initialize(teacher_2, opt_level="O1")

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume,
                                    map_location=lambda storage, loc: storage.cuda(args.gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # Data loading code
    if len(args.data) == 1:
        traindir = []
        valdir = []
        for path in os.listdir(os.path.join(args.data[0], "train")):
            traindir.append(os.path.join(args.data[0], "train", path))
        traindir = sorted(traindir)
        for path in os.listdir(os.path.join(args.data[0], "validation")):
            valdir.append(os.path.join(args.data[0], "validation", path))
        valdir = sorted(valdir)
        print(len(valdir), len(traindir))
    else:
        traindir = args.data[0]
        valdir = args.data[1]

    if args.arch == "inception_v3":
        crop_size = 299
        val_size = 320  # I chose this value arbitrarily, we can adjust.
    else:
        crop_size = 224
        val_size = 256

    pipe = HybridTrainPipe(batch_size=args.batch_size, num_threads=args.workers,
                           device_id=args.local_rank, data_dir=traindir,
                           crop=crop_size, dali_cpu=args.dali_cpu)
    pipe.build()
    train_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size()['__TFRecordReader_1'] / args.world_size))

    pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers,
                         device_id=args.local_rank, data_dir=valdir,
                         crop=crop_size, size=val_size)
    pipe.build()
    val_loader = DALIClassificationIterator(
        pipe, size=int(pipe.epoch_size()['__TFRecordReader_5'] / args.world_size))

    if args.evaluate:
        validate(val_loader, model, criterion, teacher)
        return

    total_time = AverageMeter()
    for epoch in range(args.start_epoch, args.epochs):
        # Distillation weight schedule: heavier early, lighter late.
        if epoch < 30:
            args.alpha = 0.9
        elif epoch < 60:
            args.alpha = 0.9
        elif epoch < 80:
            args.alpha = 0.5
        elif epoch < 100:
            args.alpha = 0.1

        if epoch == 30 or epoch == 60 or epoch == 90:
            save_checkpoint({
                'epoch': epoch,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, False, filename=os.path.join(args.log_str,
                                            'epoch_{}'.format(epoch) + '_checkpoint.pth.tar'))

        # train for one epoch
        loss_kd = None
        if args.distillation:
            avg_train_time, losses, top1, top5, loss_kd = train_kd(
                train_loader, model, (teacher, teacher_2), criterion, optimizer, epoch)
        else:
            avg_train_time, losses, top1, top5 = train(train_loader, model, criterion,
                                                       optimizer, epoch)
        total_time.update(avg_train_time)

        if args.local_rank == 0:
            writer.add_scalar('Loss/train', losses, epoch)
            writer.add_scalar('Accuracy/train_prec1', top1, epoch)
            writer.add_scalar('Accuracy/train_prec5', top5, epoch)
            if loss_kd:
                writer.add_scalar('Loss/train/kd_loss', loss_kd, epoch)

        if args.prof:
            break

        # evaluate on validation set
        with torch.no_grad():
            prec1, prec5, losses, loss_kd = validate(val_loader, model, criterion, teacher)

        if args.local_rank == 0:
            writer.add_scalar('Loss/test', losses, epoch)
            writer.add_scalar('Accuracy/test_prec1', prec1, epoch)
            writer.add_scalar('Accuracy/test_prec5', prec5, epoch)
            if loss_kd:
                writer.add_scalar('Loss/test/loss_kd', loss_kd, epoch)

        torch.cuda.empty_cache()
        # val_pipe.release_outputs()

        # remember best prec@1 and save checkpoint
        if args.local_rank == 0:
            is_best = prec1 > best_prec1
            best_prec1 = max(prec1, best_prec1)
            best_prec5 = max(prec5, best_prec5)
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict(),
                'best_prec5': best_prec5,
            }, is_best)
            if epoch == args.epochs - 1:
                print('##Top-1 {0}\n'
                      '##Top-5 {1}\n'
                      '##Perf {2}'.format(prec1, prec5,
                                          args.total_batch_size / total_time.avg))

        # reset DALI iterators
        del prec5, prec1, losses
        train_loader.reset()
        val_loader.reset()
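# The script above delegates the actual distillation objective to train_kd(), which is not shown
# here. As a hedged illustration only (not necessarily what train_kd implements), a typical
# knowledge-distillation loss mixes a softened teacher/student KL term with the hard-label
# cross-entropy, weighted by the alpha that the epoch schedule above adjusts:
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, targets, alpha=0.9, T=4.0):
    # Softened KL term (Hinton et al., 2015); the T*T factor keeps the gradient scale
    # comparable across temperatures.
    soft = F.kl_div(F.log_softmax(student_logits / T, dim=1),
                    F.softmax(teacher_logits / T, dim=1),
                    reduction='batchmean') * (T * T)
    # Standard cross-entropy against the ground-truth labels.
    hard = F.cross_entropy(student_logits, targets)
    return alpha * soft + (1.0 - alpha) * hard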
args = argumentparser.parse_args()
torch.cuda.set_device(args.local_rank)
dist.init_process_group(backend='nccl', init_method='env://')

# Sanity check: gather a random tensor from every rank.
rnd = torch.rand((5, 2)).cuda()
rnd_gathered = awesome_allgather_function.apply(rnd)
print("gathering random tensors\nbefore\n", rnd, "\nafter\n", rnd_gathered)
# so far this works as expected

print("now running a DDP model")
from apex.parallel import DistributedDataParallel as DDP

c = nn.Conv2d(2, 3, 3, 1, 1, 1, 1, True).cuda()
c = DDP(c)
opt = Adam(c.parameters())

# Use a different batch size on rank 0 to exercise uneven gathers.
bs = 5
if dist.get_rank() == 0:
    bs = 4

inp = torch.rand((bs, 2, 5, 5)).cuda()
out = c(inp)
print("output_shape", out.shape)

out_gathered = awesome_allgather_function.apply(out)
print("output_shape_after_gather", out_gathered.shape)
# this also works
loss = out_gathered.sum()
loss.backward()
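# awesome_allgather_function is not defined in this snippet. As a rough sketch of what such a
# helper usually looks like (an autograd-aware all_gather; this sketch assumes equal tensor
# shapes on every rank, so the uneven batch sizes exercised above would additionally need
# padding or a size exchange), one possible implementation is:
import torch
import torch.distributed as dist

class AllGatherWithGrad(torch.autograd.Function):
    @staticmethod
    def forward(ctx, tensor):
        world_size = dist.get_world_size()
        gathered = [torch.empty_like(tensor) for _ in range(world_size)]
        dist.all_gather(gathered, tensor)
        return torch.cat(gathered, dim=0)

    @staticmethod
    def backward(ctx, grad_output):
        # Each rank receives only the gradient slice that corresponds to its own input.
        world_size = dist.get_world_size()
        per_rank = grad_output.shape[0] // world_size
        rank = dist.get_rank()
        return grad_output[rank * per_rank:(rank + 1) * per_rank]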
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--data_dir", default='../../../MuTual/data/mutual', type=str, help="The input data dir. Should contain the .tsv files (or other data files) for the task.") parser.add_argument("--model_name_or_path", default="google/electra-large-discriminator", type=str) parser.add_argument("--model_type", default="electra", type = str, help = "Pre-trained Model selected in the list: bert, roberta, electra") parser.add_argument("--task_name", default="mutual", type=str, help="The name of the task to train.") parser.add_argument("--output_dir", default="output_mutual_electra_3", type=str, help="The output directory where the model predictions and checkpoints will be written.") parser.add_argument("--max_utterance_num", default=20, type=int, help="The maximum total utterance number.") parser.add_argument("--cache_flag", default="v1", type=str, help="The output directory where the model predictions and checkpoints will be written.") ## Other parameters parser.add_argument("--max_grad_norm", default = 1.0, type = float, help = "The maximum grad norm for clipping") parser.add_argument("--cache_dir", default='../../cached_models', type=str, help="Where do you want to store the pre-trained models downloaded from s3") parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--baseline", action='store_true', help="Whether to run baseline.") parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--train_batch_size", default=24, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=24, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=4e-6, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_rnn", default=1, type=int, help="RNN.") parser.add_argument("--num_decouple", default=1, type=int, help="Decoupling Layers.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument('--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=0, help="Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() processors = { "ubuntu": UbuntuProcessor, 'douban': DoubanProcessor, 'ecd': UbuntuProcessor, "mutual": MuTualProcessor } output_modes = { "ubuntu": "classification", "mutual": "classification", 'douban': "classification", 'ecd': 'classification' } if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt = '%m/%d/%Y %H:%M:%S', level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) task_name = args.task_name.lower() if task_name not in processors: raise ValueError("Task not found: %s" % (task_name)) processor = processors[task_name]() output_mode = output_modes[task_name] label_list = processor.get_labels() num_labels = len(label_list) config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type] if args.baseline: if args.model_type == 'electra': model_class = Baseline elif args.model_type == 'bert': model_class = BertBaseline elif args.model_type == 'roberta': model_class = RobertaBaseline config = config_class.from_pretrained(args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name, cache_dir=args.cache_dir if args.cache_dir else None) tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path, do_lower_case=args.do_lower_case, cache_dir=args.cache_dir if args.cache_dir else None) model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config, cache_dir=args.cache_dir if args.cache_dir else None) train_examples = None num_train_optimization_steps = None if args.do_train: eval_examples = 
processor.get_dev_examples(args.data_dir) for idx, example in enumerate(eval_examples): article = example.text_b[0] lens = [len(i) for i in article] print((idx, len(article), sum(lens))) #print(idx, example.guid) cached_train_features_file = args.data_dir + '_{0}_{1}_{2}_{3}_{4}_{5}'.format( list(filter(None, args.model_name_or_path.split('/'))).pop(), "valid",str(args.task_name), str(args.max_seq_length), str(args.max_utterance_num), str(args.cache_flag)) eval_features = None try: with open(cached_train_features_file, "rb") as reader: eval_features = pickle.load(reader) except: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, args.max_utterance_num, tokenizer, output_mode) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving eval features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(eval_features, writer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_sep_pos = torch.tensor(select_field(eval_features, 'sep_pos'), dtype=torch.long) all_turn_ids = torch.tensor(select_field(eval_features, 'turn_ids'), dtype = torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_sep_pos, all_turn_ids, all_label_ids) # Run prediction for full data print(eval_data) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) #------------- train_examples = processor.get_train_examples(args.data_dir) num_train_optimization_steps = int( len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size() if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} ] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: 
optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) global_step = 0 nb_tr_steps = 0 tr_loss = 0 if args.do_train: cached_train_features_file = args.data_dir + '_{0}_{1}_{2}_{3}_{4}_{5}'.format( list(filter(None, args.model_name_or_path.split('/'))).pop(), "train",str(args.task_name), str(args.max_seq_length), str(args.max_utterance_num), str(args.cache_flag)) train_features = None try: with open(cached_train_features_file, "rb") as reader: train_features = pickle.load(reader) except: train_features = convert_examples_to_features( train_examples, label_list, args.max_seq_length, args.max_utterance_num, tokenizer, output_mode) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving train features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(train_features, writer) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_examples)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) # (batch_size, 1, seq_len) all_input_ids = torch.tensor(select_field(train_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(train_features, 'input_mask'), dtype=torch.long) all_segment_ids = torch.tensor(select_field(train_features, 'segment_ids'), dtype=torch.long) #all_response_len = torch.tensor(select_field(train_features, 'response_len'), dtype=torch.long) all_sep_pos = torch.tensor(select_field(train_features, 'sep_pos'), dtype=torch.long) all_turn_ids = torch.tensor(select_field(train_features, 'turn_ids'), dtype = torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label for f in train_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label for f in train_features], dtype=torch.float) train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_sep_pos, all_turn_ids, all_label_ids) if args.local_rank == -1: train_sampler = RandomSampler(train_data) else: train_sampler = DistributedSampler(train_data) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size) eval_examples = processor.get_dev_examples(args.data_dir) cached_train_features_file = args.data_dir + '_{0}_{1}_{2}_{3}_{4}_{5}'.format( list(filter(None, args.model_name_or_path.split('/'))).pop(), "valid",str(args.task_name), str(args.max_seq_length), str(args.max_utterance_num), str(args.cache_flag)) eval_features = None try: with open(cached_train_features_file, "rb") as reader: eval_features = pickle.load(reader) except: eval_features = convert_examples_to_features( eval_examples, label_list, args.max_seq_length, args.max_utterance_num, tokenizer, output_mode) if args.local_rank == -1 or torch.distributed.get_rank() == 0: logger.info(" Saving eval features into cached file %s", cached_train_features_file) with open(cached_train_features_file, "wb") as writer: pickle.dump(eval_features, writer) logger.info("***** Running evaluation *****") logger.info(" Num examples = %d", len(eval_examples)) logger.info(" Batch size = %d", args.eval_batch_size) all_input_ids = torch.tensor(select_field(eval_features, 'input_ids'), dtype=torch.long) all_input_mask = torch.tensor(select_field(eval_features, 'input_mask'), dtype=torch.long) 
all_segment_ids = torch.tensor(select_field(eval_features, 'segment_ids'), dtype=torch.long) all_sep_pos = torch.tensor(select_field(eval_features, 'sep_pos'), dtype=torch.long) all_turn_ids = torch.tensor(select_field(eval_features, 'turn_ids'), dtype = torch.long) if output_mode == "classification": all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.long) elif output_mode == "regression": all_label_ids = torch.tensor([f.label for f in eval_features], dtype=torch.float) eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_sep_pos, all_turn_ids, all_label_ids) # Run prediction for full data eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) for epoch in trange(int(args.num_train_epochs), desc="Epoch"): model.train() tr_loss = 0 #nb_tr_examples = 0 nb_tr_steps = 0 for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet', 'albert'] else None, # XLM don't use segment_ids 'sep_pos': batch[3], 'turn_ids': batch[4], 'labels': batch[5]} #input_ids, input_mask, segment_ids, response_len, sep_pos, label_ids = batch output = model(**inputs) loss = output[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) tr_loss += loss.detach().item() nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear.get_lr( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr(model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, str(epoch) + "_" + WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) model.eval() eval_loss = 0 nb_eval_steps = 0 preds = None for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) with torch.no_grad(): inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet', 'albert'] else None, # XLM don't use segment_ids 'sep_pos': batch[3], 'turn_ids': batch[4], 'labels': batch[5]} #outputs = eval_model(**inputs) outputs = model(**inputs) tmp_eval_loss, logits = outputs[:2] eval_loss += tmp_eval_loss.detach().mean().item() nb_eval_steps += 1 if preds is None: preds = logits.detach().cpu().numpy() out_label_ids = inputs['labels'].detach().cpu().numpy() else: preds = np.append(preds, logits.detach().cpu().numpy(), axis=0) out_label_ids = np.append(out_label_ids, inputs['labels'].detach().cpu().numpy(), 
axis=0) eval_loss = eval_loss / nb_eval_steps result = compute_metrics(task_name, preds, out_label_ids) loss = tr_loss / nb_tr_steps if args.do_train else None result['eval_loss'] = eval_loss result['global_step'] = global_step result['loss'] = loss output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "a") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def train(args, model):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir, exist_ok=True)
        writer = SummaryWriter(log_dir=os.path.join("logs", args.name))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    # Prepare dataset
    train_loader, test_loader = get_loader(args)

    # Prepare optimizer and scheduler
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.learning_rate,
                                momentum=0.9,
                                weight_decay=args.weight_decay)
    t_total = args.num_steps
    if args.decay_type == "cosine":
        scheduler = WarmupCosineSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
    else:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)

    if args.fp16:
        model, optimizer = amp.initialize(models=model,
                                          optimizers=optimizer,
                                          opt_level=args.fp16_opt_level)
        amp._amp_state.loss_scalers[0]._loss_scale = 2**20

    # Distributed training
    if args.local_rank != -1:
        model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size())

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Total optimization steps = %d", args.num_steps)
    logger.info("  Instantaneous batch size per GPU = %d", args.train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    model.zero_grad()
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    losses = AverageMeter()
    global_step, best_acc = 0, 0
    while True:
        model.train()
        epoch_iterator = tqdm(train_loader,
                              desc="Training (X / X Steps) (loss=X.X)",
                              bar_format="{l_bar}{r_bar}",
                              dynamic_ncols=True,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            batch = tuple(t.to(args.device) for t in batch)
            x, y = batch
            loss = model(x, y)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                losses.update(loss.item() * args.gradient_accumulation_steps)
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                scheduler.step()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1

                epoch_iterator.set_description(
                    "Training (%d / %d Steps) (loss=%2.5f)" % (global_step, t_total, losses.val))
                if args.local_rank in [-1, 0]:
                    writer.add_scalar("train/loss", scalar_value=losses.val, global_step=global_step)
                    writer.add_scalar("train/lr", scalar_value=scheduler.get_lr()[0], global_step=global_step)
                if global_step % args.eval_every == 0 and args.local_rank in [-1, 0]:
                    accuracy = valid(args, model, writer, test_loader, global_step)
                    if best_acc < accuracy:
                        save_model(args, model)
                        best_acc = accuracy
                    model.train()

                if global_step % t_total == 0:
                    break
        losses.reset()
        if global_step % t_total == 0:
            break

    if args.local_rank in [-1, 0]:
        writer.close()
    logger.info("Best Accuracy: \t%f" % best_acc)
    logger.info("End Training!")
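# Note on the batch-size bookkeeping in train() above (illustrative numbers, not from the script):
# with --train_batch_size 512 and --gradient_accumulation_steps 8, each forward pass runs on
# 512 // 8 = 64 samples, gradients are summed over 8 such micro-batches, and every optimizer
# step therefore still corresponds to an effective batch of 512 (times world_size when training
# is distributed), which is what the "Total train batch size" log line reports.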
def main(): parser = argparse.ArgumentParser() parser.add_argument( "--bert_model", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--from_pretrained", default="bert-base-uncased", type=str, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.", ) parser.add_argument( "--output_dir", default="save", type=str, help="The output directory where the model checkpoints will be written.", ) parser.add_argument( "--config_file", default="config/bert_config.json", type=str, help="The config file which specified the model details.", ) parser.add_argument( "--learning_rate", default=2e-5, type=float, help="The initial learning rate for Adam." ) parser.add_argument( "--num_train_epochs", default=20, type=int, help="Total number of training epochs to perform.", ) parser.add_argument( "--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.", ) parser.add_argument( "--no_cuda", action="store_true", help="Whether not to use CUDA when available" ) parser.add_argument( "--do_lower_case", default=True, type=bool, help="Whether to lower case the input text. True for uncased models, False for cased models.", ) parser.add_argument( "--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus" ) parser.add_argument("--seed", type=int, default=0, help="random seed for initialization") parser.add_argument( "--gradient_accumulation_steps", type=int, default=1, help="Number of updates steps to accumualte before performing a backward/update pass.", ) parser.add_argument( "--fp16", action="store_true", help="Whether to use 16-bit float precision instead of 32-bit", ) parser.add_argument( "--loss_scale", type=float, default=0, help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n", ) parser.add_argument( "--num_workers", type=int, default=16, help="Number of workers in the dataloader." ) parser.add_argument( "--save_name", default='', type=str, help="save name for training.", ) parser.add_argument( "--use_chunk", default=0, type=float, help="whether use chunck for parallel training." ) parser.add_argument( "--in_memory", default=False, type=bool, help="whether use chunck for parallel training." ) parser.add_argument( "--optimizer", default='BertAdam', type=str, help="whether use chunck for parallel training." ) parser.add_argument( "--tasks", default='', type=str, help="1-2-3... training task separate by -" ) parser.add_argument( "--freeze", default = -1, type=int, help="till which layer of textual stream of vilbert need to fixed." ) parser.add_argument( "--vision_scratch", action="store_true", help="whether pre-trained the image or not." ) parser.add_argument( "--evaluation_interval", default=1, type=int, help="evaluate very n epoch." ) parser.add_argument( "--lr_scheduler", default='mannul', type=str, help="whether use learning rate scheduler." ) parser.add_argument( "--baseline", action="store_true", help="whether use single stream baseline." ) parser.add_argument( "--compact", action="store_true", help="whether use compact vilbert model." 
) args = parser.parse_args() with open('vlbert_tasks.yml', 'r') as f: task_cfg = edict(yaml.load(f)) # random.seed(args.seed) # np.random.seed(args.seed) # torch.manual_seed(args.seed) if args.baseline: from vilbert.basebert import BertConfig from vilbert.basebert import BaseBertForVLTasks elif args.compact: from vilbert.vilbert_compact import BertConfig from vilbert.vilbert_compact import VILBertForVLTasks else: from vilbert.vilbert import BertConfig from vilbert.vilbert import VILBertForVLTasks task_names = [] task_lr = [] for i, task_id in enumerate(args.tasks.split('-')): task = 'TASK' + task_id name = task_cfg[task]['name'] task_names.append(name) task_lr.append(task_cfg[task]['lr']) base_lr = min(task_lr) loss_scale = {} for i, task_id in enumerate(args.tasks.split('-')): task = 'TASK' + task_id loss_scale[task] = task_lr[i] / base_lr if args.save_name: prefix = '-' + args.save_name else: prefix = '' timeStamp = '-'.join(task_names) + '_' + args.config_file.split('/')[1].split('.')[0] + prefix savePath = os.path.join(args.output_dir, timeStamp) bert_weight_name = json.load(open("config/" + args.bert_model + "_weight_name.json", "r")) print('args.local_rank is', args.local_rank) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend="nccl") logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16 ) ) default_gpu = False if dist.is_available() and args.local_rank != -1: rank = dist.get_rank() if rank == 0: default_gpu = True else: default_gpu = True if default_gpu: if not os.path.exists(savePath): os.makedirs(savePath) config = BertConfig.from_json_file(args.config_file) if default_gpu: # save all the hidden parameters. 
with open(os.path.join(savePath, 'command.txt'), 'w') as f: print(args, file=f) # Python 3.x print('\n', file=f) print(config, file=f) task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, \ task_dataloader_train, task_dataloader_val = LoadDatasets(args, task_cfg, args.tasks.split('-')) tbLogger = utils.tbLogger(timeStamp, savePath, task_names, task_ids, task_num_iters, args.gradient_accumulation_steps) # if n_gpu > 0: # torch.cuda.manual_seed_all(args.seed) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) num_train_optimization_steps = max(task_num_iters.values()) * args.num_train_epochs // args.gradient_accumulation_steps num_labels = max([dataset.num_labels for dataset in task_datasets_train.values()]) task_start_iter = {} task_interval = {} for task_id, num_iter in task_num_iters.items(): task_start_iter[task_id] = num_train_optimization_steps - (task_cfg[task]['num_epoch'] * num_iter // args.gradient_accumulation_steps) task_interval[task_id] = num_train_optimization_steps // (task_cfg[task]['num_epoch'] * num_iter // args.gradient_accumulation_steps) # num_labels = 4 if args.baseline: model = BaseBertForVLTasks.from_pretrained( args.from_pretrained, config, num_labels=num_labels, default_gpu=default_gpu ) else: model = VILBertForVLTasks.from_pretrained( args.from_pretrained, config, num_labels=num_labels, default_gpu=default_gpu ) # Config optimizer optimizer_grouped_parameters = [] no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"] for key, value in dict(model.named_parameters()).items(): if value.requires_grad: if 'vil_prediction' in key: # if args.learning_rate <= 2e-5: lr = 1e-4 else: if args.vision_scratch: if key[12:] in bert_weight_name: lr = args.learning_rate else: lr = 1e-4 else: lr = args.learning_rate if any(nd in key for nd in no_decay): optimizer_grouped_parameters += [ {"params": [value], "lr": lr, "weight_decay": 0.01} ] if not any(nd in key for nd in no_decay): optimizer_grouped_parameters += [ {"params": [value], "lr": lr, "weight_decay": 0.0} ] if args.optimizer == 'BertAdam': optimizer = BertAdam( optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, schedule='warmup_constant', ) elif args.optimizer == 'Adam': optimizer = Adam( optimizer_grouped_parameters, lr=base_lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, schedule='warmup_constant', ) elif args.optimizer == 'Adamax': optimizer = Adamax( optimizer_grouped_parameters, lr=base_lr, warmup=args.warmup_proportion, t_total=num_train_optimization_steps, schedule='warmup_constant', ) if args.lr_scheduler == 'automatic': lr_scheduler = ReduceLROnPlateau(optimizer, \ mode='max', factor=0.2, patience=1, cooldown=1, threshold=0.001) elif args.lr_scheduler == 'mannul': lr_reduce_list = np.array([12, 16]) # lr_reduce_list = np.array([6, 8, 10]) def lr_lambda_fun(epoch): return pow(0.1, np.sum(lr_reduce_list <= epoch)) lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun) model.to(device) model, optimizer = amp.initialize(model, optimizer, enabled=args.fp16, opt_level='O2') task_losses = LoadLosses(args, task_cfg, args.tasks.split('-')) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) model = DDP(model, delay_allreduce=True) elif n_gpu > 1: model = torch.nn.DataParallel(model) if args.freeze != -1: bert_weight_name_filtered = [] for name in bert_weight_name: if 'embeddings' in name: bert_weight_name_filtered.append(name) elif 'encoder' in name: layer_num = name.split('.')[2] if int(layer_num) <= args.freeze: bert_weight_name_filtered.append(name) optimizer_grouped_parameters = [] for key, value in dict(model.named_parameters()).items(): if key[12:] in bert_weight_name_filtered: value.requires_grad = False if default_gpu: print("filtered weight") print(bert_weight_name_filtered) if default_gpu: print(len(list(model.named_parameters())), len(optimizer_grouped_parameters)) max_num_iter = max(task_num_iters.values()) max_batch_size = max(task_batch_size.values()) if default_gpu: print("***** Running training *****") print(" Num Iters: ", task_num_iters) print(" Batch size: ", task_batch_size) print(" Num steps: %d" %num_train_optimization_steps) startIterID = 0 # initialize the data iteration. task_iter_train = {name:None for name in task_ids} task_count = {name:0 for name in task_ids} for epochId in tqdm(range(args.num_train_epochs), desc="Epoch"): model.train() for step in range(max_num_iter): iterId = startIterID + step + (epochId * max_num_iter) for task_id in task_ids: if iterId >= task_start_iter[task_id]: # if iterId % task_interval[task_id] == 0: loss, score = ForwardModelsTrain(args, task_cfg, device, task_id, task_count, task_iter_train, task_dataloader_train, model, task_losses, task_start_iter) loss = loss * loss_scale[task_id] delay_unscale = (step + 1) % args.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale ) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale grads = [p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None] all_reduce_and_rescale_tensors(grads, float(1)) if (step + 1) % args.gradient_accumulation_steps == 0: optimizer.step() model.zero_grad() if default_gpu: tbLogger.step_train(epochId, iterId, float(loss), float(score), optimizer.show_lr(), task_id, 'train') if step % (20 * args.gradient_accumulation_steps) == 0 and step != 0 and default_gpu: tbLogger.showLossTrain() model.eval() # when run evaluate, we run each task sequentially. for task_id in task_ids: for i, batch in enumerate(task_dataloader_val[task_id]): with torch.no_grad(): loss, score, batch_size = ForwardModelsVal(args, task_cfg, device, task_id, batch, model, task_losses) tbLogger.step_val(epochId, float(loss), float(score), task_id, batch_size, 'val') if default_gpu: sys.stdout.write('%d/%d\r' % (i, len(task_dataloader_val[task_id]))) sys.stdout.flush() ave_score = tbLogger.showLossVal() if args.lr_scheduler == 'automatic': lr_scheduler.step(ave_score) logger.info("best average score is %3f" %lr_scheduler.best) else: lr_scheduler.step() if default_gpu: # Save a trained model logger.info("** ** * Saving fine - tuned model on " + timeStamp + "** ** * ") model_to_save = ( model.module if hasattr(model, "module") else model ) # Only save the model it-self if not os.path.exists(savePath): os.makedirs(savePath) output_model_file = os.path.join(savePath, "pytorch_model_" + str(epochId) + ".bin") torch.save(model_to_save.state_dict(), output_model_file) tbLogger.txt_close()
target_var = target.cuda(device)

# declare the optimizer and criterion
criterion = nn.CrossEntropyLoss().cuda(device)

with cudnn.flags(enabled=True, benchmark=True):
    for i in range(total_iters):
        logging.info(f"local_rank {local_rank} iteration {i}")
        for j in range(iterations):
            output = model(input_var, chunks=chunks)
            loss = criterion(output, target_var)
            logging.info(f"local_rank {local_rank} loss {loss}")
            # logging.info(f"local_rank {local_rank} loss requires_grad {loss.requires_grad}")
            # logging.info(f"local_rank {local_rank} loss grad_fn {loss.grad_fn}")
            optimizer.zero_grad()

            if use_amp == 1:
                with amp.scale_loss(loss, optimizer, delay_unscale=False) as scaled_loss:
                    scaled_loss.backward()
            if use_amp == 0:
                loss.backward()

            # Debug check: accumulate the norm of all trainable parameters.
            count_param = 0
            norm_total = 0
            for param in model.parameters():
                if param.requires_grad:
                    count_param = count_param + 1
                    norm_total += param.data.norm()
            logging.info("rank {} parameter norm {} {}".format(
                local_rank, count_param, norm_total))

            optimizer.step()
            # torch.cuda.synchronize()
def main(): parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training') parser = parse_args(parser) args, _ = parser.parse_known_args() LOGGER.set_model_name("Tacotron2_PyT") LOGGER.set_backends([ dllg.StdOutBackend(log_file=None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1), dllg.JsonBackend(log_file=args.log_file if args.rank == 0 else None, logging_scope=dllg.TRAIN_ITER_SCOPE, iteration_interval=1) ]) LOGGER.timed_block_start("run") LOGGER.register_metric(tags.TRAIN_ITERATION_LOSS, metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("iter_time", metric_scope=dllg.TRAIN_ITER_SCOPE) LOGGER.register_metric("epoch_time", metric_scope=dllg.EPOCH_SCOPE) LOGGER.register_metric("run_time", metric_scope=dllg.RUN_SCOPE) LOGGER.register_metric("val_iter_loss", metric_scope=dllg.EPOCH_SCOPE) LOGGER.register_metric("train_epoch_items/sec", metric_scope=dllg.EPOCH_SCOPE) LOGGER.register_metric("train_epoch_avg_loss", metric_scope=dllg.EPOCH_SCOPE) log_hardware() model_name = args.model_name parser = models.parse_model_args(model_name, parser) parser.parse_args() args = parser.parse_args() log_args(args) torch.backends.cudnn.enabled = args.cudnn_enabled torch.backends.cudnn.benchmark = args.cudnn_benchmark distributed_run = args.world_size > 1 if distributed_run: init_distributed(args, args.world_size, args.rank, args.group_name) LOGGER.log(key=tags.RUN_START) run_start_time = time.time() model_config = models.get_model_config(model_name, args) model = models.get_model(model_name, model_config, to_fp16=args.fp16_run, to_cuda=True) epoch_start = 0 if args.resume: resume_model_path = args.resume_tacotron2_path if args.model_name == "Tacotron2" else args.resume_waveglow_path checkpoint = torch.load(resume_model_path, map_location='cpu') epoch_start = checkpoint["epoch"] state_dict = checkpoint['state_dict'] if checkpoint_from_distributed(state_dict): state_dict = unwrap_distributed(state_dict) model.load_state_dict(state_dict) print("restore model %s" % resume_model_path) if distributed_run: model = DDP(model) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) if args.fp16_run: optimizer = FP16_Optimizer( optimizer, dynamic_loss_scale=args.dynamic_loss_scaling) try: sigma = args.sigma except AttributeError: sigma = None criterion = loss_functions.get_loss_function(model_name, sigma) try: n_frames_per_step = args.n_frames_per_step except AttributeError: n_frames_per_step = None collate_fn = data_functions.get_collate_function(model_name, n_frames_per_step) trainset = data_functions.get_data_loader(model_name, args.dataset_path, args.training_files, args) train_sampler = DistributedSampler(trainset) if distributed_run else None train_loader = DataLoader(trainset, num_workers=1, shuffle=False, sampler=train_sampler, batch_size=args.batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn) valset = data_functions.get_data_loader(model_name, args.dataset_path, args.validation_files, args) batch_to_gpu = data_functions.get_batch_to_gpu(model_name) iteration = 0 model.train() LOGGER.log(key=tags.TRAIN_LOOP) for epoch in range(epoch_start, args.epochs): LOGGER.epoch_start() epoch_start_time = time.time() LOGGER.log(key=tags.TRAIN_EPOCH_START, value=epoch) # used to calculate avg items/sec over epoch reduced_num_items_epoch = 0 # used to calculate avg loss over epoch train_epoch_avg_loss = 0.0 num_iters = 0 # if overflow at the last iteration then do not save checkpoint overflow = False for i, batch in 
enumerate(train_loader): LOGGER.iteration_start() iter_start_time = time.time() LOGGER.log(key=tags.TRAIN_ITER_START, value=i) print("Batch: {}/{} epoch {}".format(i, len(train_loader), epoch)) start = time.perf_counter() adjust_learning_rate(epoch, optimizer, args.learning_rate, args.anneal_steps, args.anneal_factor) model.zero_grad() x, y, num_items = batch_to_gpu(batch) if args.fp16_run: y_pred = model(fp32_to_fp16(x)) loss = criterion(fp16_to_fp32(y_pred), y) else: y_pred = model(x) loss = criterion(y_pred, y) if distributed_run: reduced_loss = reduce_tensor(loss.data, args.world_size).item() reduced_num_items = reduce_tensor(num_items.data, 1).item() else: reduced_loss = loss.item() reduced_num_items = num_items.item() if np.isnan(reduced_loss): raise Exception("loss is NaN") LOGGER.log(key=tags.TRAIN_ITERATION_LOSS, value=reduced_loss) train_epoch_avg_loss += reduced_loss num_iters += 1 # accumulate number of items processed in this epoch reduced_num_items_epoch += reduced_num_items if args.fp16_run: optimizer.backward(loss) grad_norm = optimizer.clip_master_grads(args.grad_clip_thresh) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), args.grad_clip_thresh) optimizer.step() overflow = optimizer.overflow if args.fp16_run else False iteration += 1 LOGGER.log(key=tags.TRAIN_ITER_STOP, value=i) iter_stop_time = time.time() iter_time = iter_stop_time - iter_start_time LOGGER.log(key="train_iter_items/sec", value=(reduced_num_items / iter_time)) LOGGER.log(key="iter_time", value=iter_time) LOGGER.iteration_stop() LOGGER.log(key=tags.TRAIN_EPOCH_STOP, value=epoch) epoch_stop_time = time.time() epoch_time = epoch_stop_time - epoch_start_time LOGGER.log(key="train_epoch_items/sec", value=(reduced_num_items_epoch / epoch_time)) LOGGER.log(key="train_epoch_avg_loss", value=(train_epoch_avg_loss / num_iters if num_iters > 0 else 0.0)) LOGGER.log(key="epoch_time", value=epoch_time) LOGGER.log(key=tags.EVAL_START, value=epoch) validate(model, criterion, valset, iteration, args.batch_size, args.world_size, collate_fn, distributed_run, args.rank, batch_to_gpu, args.fp16_run) LOGGER.log(key=tags.EVAL_STOP, value=epoch) if not overflow and (epoch % args.epochs_per_checkpoint == 0) and args.rank == 0: checkpoint_path = os.path.join( args.output_directory, "checkpoint_{}_{}".format(model_name, epoch)) save_checkpoint(model, epoch, model_config, checkpoint_path) save_sample( model_name, model, args.waveglow_checkpoint, args.tacotron2_checkpoint, args.phrase_path, os.path.join(args.output_directory, "sample_{}_{}.wav".format(model_name, iteration)), args.sampling_rate, args.fp16_run) LOGGER.epoch_stop() run_stop_time = time.time() run_time = run_stop_time - run_start_time LOGGER.log(key="run_time", value=run_time) LOGGER.log(key=tags.RUN_FINAL) print("training time", run_stop_time - run_start_time) LOGGER.timed_block_stop("run") if args.rank == 0: LOGGER.finish()
def main(): if not torch.cuda.is_available(): logging.info('no gpu device available') sys.exit(1) # set seeds np.random.seed(args.seed) cudnn.benchmark = True torch.manual_seed(args.seed) cudnn.enabled = True torch.cuda.manual_seed(args.seed) logging.info('args = %s', args) # Get data loaders. traindir = os.path.join(args.data, 'train') validdir = os.path.join(args.data, 'val') # data augmentation normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) train_transform = transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.2), transforms.ToTensor(), normalize, ]) val_transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ]) train_data = dset.ImageFolder(traindir, transform=train_transform) valid_data = dset.ImageFolder(validdir, transform=val_transform) # dataset split valid_data, test_data = utils.dataset_split(valid_data, len(valid_data)) train_sampler = torch.utils.data.distributed.DistributedSampler(train_data) train_queue = torch.utils.data.DataLoader(train_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=8, sampler=train_sampler) valid_queue = torch.utils.data.DataLoader(valid_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=8) test_queue = torch.utils.data.DataLoader(test_data, batch_size=args.batch_size, shuffle=False, pin_memory=True, num_workers=8) # Create model and loss. torch.hub.set_dir('/tmp/hub_cache_%d' % args.local_rank) model = torch.hub.load('pytorch/vision:v0.4.2', 'resnet50', pretrained=False) model = model.cuda() model = DDP(model, delay_allreduce=True) criterion = nn.CrossEntropyLoss() criterion = criterion.cuda() criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth) criterion_smooth = criterion_smooth.cuda() # Set up network weights optimizer. if args.optimizer == 'SGD': optimizer = torch.optim.SGD(model.parameters(), args.learning_rate, momentum=args.momentum, weight_decay=args.weight_decay) elif args.optimizer == 'fromage': optimizer = Fromage(model.parameters(), args.learning_rate) elif args.optimizer == 'adam': optimizer = torch.optim.Adam(model.parameters(), args.learning_rate, weight_decay=args.weight_decay) else: raise NotImplementedError scheduler = torch.optim.lr_scheduler.StepLR(optimizer, gamma=0.1, step_size=30) # Train. global_step = 0 best_acc_top1 = 0 for epoch in range(args.epochs): # Shuffle the sampler, update lrs. train_queue.sampler.set_epoch(epoch + args.seed) # Training. train_acc_top1, train_acc_top5, train_obj, global_step = train( train_queue, model, criterion_smooth, optimizer, global_step) logging.info('epoch %d train_acc %f', epoch, train_acc_top1) writer.add_scalar('train/loss', train_obj, global_step) writer.add_scalar('train/acc_top1', train_acc_top1, global_step) writer.add_scalar('train/acc_top5', train_acc_top5, global_step) writer.add_scalar('train/lr', optimizer.state_dict()['param_groups'][0]['lr'], global_step) # Validation. 
valid_acc_top1, valid_acc_top5, valid_obj = infer( valid_queue, model, criterion) logging.info('valid_acc_top1 %f', valid_acc_top1) logging.info('valid_acc_top5 %f', valid_acc_top5) writer.add_scalar('val/acc_top1', valid_acc_top1, global_step) writer.add_scalar('val/acc_top5', valid_acc_top5, global_step) writer.add_scalar('val/loss', valid_obj, global_step) # Test test_acc_top1, test_acc_top5, test_obj = infer(test_queue, model, criterion) logging.info('test_acc_top1 %f', test_acc_top1) logging.info('test_acc_top5 %f', test_acc_top5) writer.add_scalar('test/acc_top1', test_acc_top1, global_step) writer.add_scalar('test/acc_top5', test_acc_top5, global_step) writer.add_scalar('test/loss', test_obj, global_step) is_best = False if valid_acc_top1 > best_acc_top1: best_acc_top1 = valid_acc_top1 is_best = True if args.local_rank == 0: utils.save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_acc_top1': best_acc_top1, 'optimizer': optimizer.state_dict(), }, is_best, args.save) # Update LR. scheduler.step() writer.flush()
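`CrossEntropyLabelSmooth(CLASSES, args.label_smooth)` is constructed above but not defined in this excerpt; below is a sketch of a standard label-smoothing cross-entropy with that interface. The body is an assumption about the actual class.

# Hedged sketch of a label-smoothing cross-entropy loss in the spirit of the
# CrossEntropyLabelSmooth used above; the real implementation may differ.
import torch
import torch.nn as nn

class CrossEntropyLabelSmooth(nn.Module):
    def __init__(self, num_classes: int, epsilon: float = 0.1):
        super().__init__()
        self.num_classes = num_classes
        self.epsilon = epsilon
        self.logsoftmax = nn.LogSoftmax(dim=1)

    def forward(self, logits, targets):
        log_probs = self.logsoftmax(logits)
        # one-hot targets mixed with a uniform distribution over classes
        one_hot = torch.zeros_like(log_probs).scatter_(1, targets.unsqueeze(1), 1)
        smoothed = (1 - self.epsilon) * one_hot + self.epsilon / self.num_classes
        return (-smoothed * log_probs).mean(0).sum()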
def main(): parser = argparse.ArgumentParser() ## Required parameters parser.add_argument("--train_file", default=None, type=str, required=True, help="The input train corpus.") parser.add_argument( "--bert_model", default=None, type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument( "--output_dir", default=None, type=str, required=True, help="The output directory where the model checkpoints will be written." ) ## Other parameters parser.add_argument( "--max_seq_length", default=128, type=int, help= "The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform.") parser.add_argument( "--warmup_proportion", default=0.1, type=float, help= "Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( "--on_memory", action='store_true', help="Whether to load train samples into memory or use disk") parser.add_argument( "--do_lower_case", action='store_true', help= "Whether to lower case the input text. True for uncased models, False for cased models." ) parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumualte before performing a backward/update pass." ) parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. 
Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--hybrid_attention", action='store_true', help="Whether to use hybrid attention") parser.add_argument("--continue_training", action='store_true', help="Continue training from a checkpoint") parser.add_argument("--no_pretrain", default="", action='store_true', help="Whether not to use pretrained model") parser.add_argument( "--config_path", default="", type=str, help="Where to load the config file when not using pretrained model") parser.add_argument( "--state_dir", default="", type=str, help= "Where to load state dict instead of using Google pre-trained model") parser.add_argument( "--teacher_path", default="", type=str, help="Where to load the config file when not using pretrained model") parser.add_argument("--kd_ratio", default=1.0, type=float, help="Knowledge distillation loss ratio") args = parser.parse_args() if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if not args.do_train: raise ValueError( "Training is currently the only implemented execution option. Please set `do_train`." 
) if os.path.exists(args.output_dir) and os.listdir( args.output_dir) and not args.continue_training: raise ValueError( "Output directory ({}) already exists and is not empty.".format( args.output_dir)) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) #train_examples = None num_train_optimization_steps = None if args.do_train: print("Loading Train Dataset", args.train_file) train_dataset = BERTDataset(args.train_file, tokenizer, seq_len=args.max_seq_length, corpus_lines=None, on_memory=args.on_memory) num_train_optimization_steps = len( train_dataset ) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps / torch.distributed.get_world_size( ) num_train_optimization_steps = math.ceil(num_train_optimization_steps) if args.no_pretrain: if not args.config_path: raise ValueError( "Config file is needed when not using the pretrained model") config = BertConfig(args.config_path) model = BertForMaskedLMStudent(config) if args.state_dir and os.path.exists(args.state_dir): state_dict = torch.load(args.state_dir) if isinstance(state_dict, dict) or isinstance( state_dict, collections.OrderedDict): if 'model' in state_dict: state_dict = state_dict['model'] print("Using my own BERT state dict.") model.load_state_dict(state_dict, strict=False) else: # Prepare model model = BertForMaskedLMStudent.from_pretrained(args.bert_model) teacher_model = BertForMaskedLMTeacher.from_pretrained(args.bert_model) if args.fp16: teacher_model.half() model.half() teacher_model.to(device) model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) teacher_model = DDP(teacher_model) elif n_gpu > 1: teacher_model = torch.nn.DataParallel(teacher_model) model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = BertAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, t_total=num_train_optimization_steps) if args.hybrid_attention: max_seq_length = args.max_seq_length attention_mask = torch.ones(12, max_seq_length, max_seq_length, dtype=torch.long) # left attention attention_mask[:2, :, :] = torch.tril( torch.ones(max_seq_length, max_seq_length, dtype=torch.long)) # right attention attention_mask[2:4, :, :] = torch.triu( torch.ones(max_seq_length, max_seq_length, dtype=torch.long)) # local attention, window size = 3 attention_mask[4:6, :, :] = torch.triu( torch.tril( torch.ones(max_seq_length, max_seq_length, dtype=torch.long), 1), -1) attention_mask = torch.cat( [attention_mask.unsqueeze(0) for _ in range(8)]) attention_mask = attention_mask.to(device) else: attention_mask = None global_step = 0 epoch_start = 0 if args.do_train: if os.path.exists(args.output_dir) and os.listdir(args.output_dir): all_cp = os.listdir(args.output_dir) steps = [ int(re.search('_\d+', cp).group()[1:]) for cp in all_cp if re.search('_\d+', cp) ] if len(steps) == 0: raise ValueError( "No existing checkpoint. Please do not use --continue_training." ) max_step = max(steps) # load checkpoint checkpoint = torch.load( os.path.join(args.output_dir, 'checkpoints_' + str(max_step) + '.pt')) logger.info("***** Loading checkpoint *****") logger.info(" Num steps = %d", checkpoint['global_step']) logger.info(" Num epoch = %d", checkpoint['epoch']) logger.info(" Loss = %d, %d", checkpoint['loss'], checkpoint['loss_now']) model.module.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) global_step = checkpoint['global_step'] epoch_start = checkpoint['epoch'] del checkpoint writer = SummaryWriter(log_dir=os.environ['HOME']) logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_optimization_steps) if args.local_rank == -1: train_sampler = RandomSampler(train_dataset) else: #TODO: check if this works with current data generator from disk that relies on next(file) # (it doesn't return item back by index) train_sampler = DistributedSampler(train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) logger.info("Teacher param nums = %d", sum(param.numel() for param in teacher_model.parameters())) logger.info("Student param nums = %d", sum(param.numel() for param in model.parameters())) model.train() teacher_model.eval() tr_loss_1000 = 0 for ep in trange(epoch_start, int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 for step, batch in enumerate( tqdm(train_dataloader, desc="Iteration")): batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids = batch with torch.no_grad(): teacher_out = teacher_model(input_ids, segment_ids, input_mask, lm_label_ids) loss = model(input_ids, segment_ids, input_mask, lm_label_ids, targets=teacher_out, hybrid_mask=attention_mask, ratio=args.kd_ratio) if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. 
if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() tr_loss_1000 += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: # modify learning rate with special warm up BERT uses # if args.fp16 is False, BertAdam is used that handles this automatically lr_this_step = args.learning_rate * warmup_linear( global_step / num_train_optimization_steps, args.warmup_proportion) for param_group in optimizer.param_groups: param_group['lr'] = lr_this_step optimizer.step() optimizer.zero_grad() global_step += 1 # log the training loss for every 1000 steps if global_step % 100 == 99: writer.add_scalar('mask_distillation/loss', tr_loss_1000 / 100, global_step) logger.info("training steps: %s", global_step) logger.info("training loss per 1000: %s", tr_loss_1000 / 100) tr_loss_1000 = 0 # save the checkpoint for every 10000 steps if global_step % 1000 == 0: model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_file = os.path.join( args.output_dir, "checkpoints_" + str(global_step) + ".pt") checkpoint = { 'model': model_to_save.state_dict(), 'optimizer': optimizer.state_dict(), 'epoch': ep, 'global_step': global_step, 'loss': tr_loss / nb_tr_steps, 'loss_now': tr_loss_1000 } if args.do_train: torch.save(checkpoint, output_file) model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin_" + str(ep)) if args.do_train: torch.save(model_to_save.state_dict(), output_model_file) logger.info("training loss: %s", tr_loss / nb_tr_steps) # Save a trained model logger.info("** ** * Saving fine - tuned model ** ** * ") model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") if args.do_train: torch.save(model_to_save.state_dict(), output_model_file)
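The fp16 branch above rescales the learning rate by hand through a `warmup_linear` helper that is defined elsewhere; a sketch of the usual linear warmup-then-decay multiplier it is assumed to compute follows.

# Hedged sketch of the warmup_linear multiplier assumed by the fp16 branch above:
# ramp up linearly for the first `warmup` fraction of training, then decay toward 0.
def warmup_linear(x: float, warmup: float = 0.002) -> float:
    if x < warmup:
        return x / warmup
    return max(0.0, 1.0 - x)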
class Solver(object): def __init__(self): """ :param config: easydict """ self.version = __version__ # logging.info("PyTorch Version {}, Solver Version {}".format(torch.__version__, self.version)) self.distributed = False self.world_size = 1 self.local_rank = 0 self.epoch = 0 self.iteration = 0 self.config = None self.model, self.optimizer, self.lr_policy = None, None, None self.step_decay = 1 if 'WORLD_SIZE' in os.environ: self.world_size = int(os.environ['WORLD_SIZE']) self.distributed = self.world_size > 1 or torch.cuda.device_count() > 1 if self.distributed: dist.init_process_group(backend="nccl", init_method='env://') self.local_rank = dist.get_rank() torch.cuda.set_device(self.local_rank) logging.info('[distributed mode] world size: {}, local rank: {}.'.format(self.world_size, self.local_rank)) else: logging.info('[Single GPU mode]') def build_environ(self): if self.config['environ']['deterministic']: cudnn.benchmark = False cudnn.deterministic = True torch.set_printoptions(precision=10) else: cudnn.benchmark = True if self.config['apex']: assert torch.backends.cudnn.enabled, "Amp requires cudnn backend to be enabled." # set random seed torch.manual_seed(self.config['environ']['seed']) if torch.cuda.is_available(): torch.cuda.manual_seed(self.config['environ']['seed']) np.random.seed(self.config['environ']['seed']) random.seed(self.config['environ']['seed']) def init_from_scratch(self, config): t_start = time.time() self.config = config self.build_environ() # model and optimizer self.model = _get_model(self.config) model_params = filter(lambda p: p.requires_grad, self.model.parameters()) self.optimizer = _get_optimizer(config['solver']['optimizer'], model_params=model_params) self.lr_policy = _get_lr_policy(config['solver']['lr_policy'], optimizer=self.optimizer) self.step_decay = config['solver']['step_decay'] if config['model'].get('pretrained_model') is not None: logging.info('loadding pretrained model from {}.'.format(config['model']['pretrained_model'])) load_model(self.model, config['model']['pretrained_model'], distributed=False) self.model.cuda(self.local_rank) if self.distributed: self.model = convert_syncbn_model(self.model) if self.config['apex']['amp_used']: # Initialize Amp. Amp accepts either values or strings for the optional override arguments, # for convenient interoperation with argparse. logging.info("Initialize Amp. opt level={}, keep batchnorm fp32={}, loss_scale={}.". 
format(self.config['apex']['opt_level'], self.config['apex']['keep_batchnorm_fp32'], self.config['apex']['loss_scale'])) self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level=self.config['apex']['opt_level'], keep_batchnorm_fp32=self.config['apex']["keep_batchnorm_fp32"], loss_scale=self.config['apex']["loss_scale"]) if self.distributed: self.model = DistributedDataParallel(self.model) t_end = time.time() logging.info("Init trainer from scratch, Time usage: IO: {}".format(t_end - t_start)) def init_from_checkpoint(self, continue_state_object): t_start = time.time() self.config = continue_state_object['config'] self.build_environ() self.model = _get_model(self.config) model_params = filter(lambda p: p.requires_grad, self.model.parameters()) self.optimizer = _get_optimizer(self.config['solver']['optimizer'], model_params=model_params) self.lr_policy = _get_lr_policy(self.config['solver']['lr_policy'], optimizer=self.optimizer) load_model(self.model, continue_state_object['model'], distributed=False) self.model.cuda(self.local_rank) if self.distributed: self.model = convert_syncbn_model(self.model) if self.config['apex']['amp_used']: # Initialize Amp. Amp accepts either values or strings for the optional override arguments, # for convenient interoperation with argparse. logging.info("Initialize Amp. opt level={}, keep batchnorm fp32={}, loss_scale={}.". format(self.config['apex']['opt_level'], self.config['apex']['keep_batchnorm_fp32'], self.config['apex']['loss_scale'])) self.model, self.optimizer = amp.initialize(self.model, self.optimizer, opt_level=self.config['apex']['opt_level'], keep_batchnorm_fp32=self.config['apex']["keep_batchnorm_fp32"], loss_scale=self.config['apex']["loss_scale"]) amp.load_state_dict(continue_state_object['amp']) if self.distributed: self.model = DistributedDataParallel(self.model) self.optimizer.load_state_dict(continue_state_object['optimizer']) self.lr_policy.load_state_dict(continue_state_object['lr_policy']) self.step_decay = self.config['solver']['step_decay'] self.epoch = continue_state_object['epoch'] self.iteration = continue_state_object["iteration"] del continue_state_object t_end = time.time() logging.info("Init trainer from checkpoint, Time usage: IO: {}".format(t_end - t_start)) def step(self, **kwargs): """ :param kwargs: :return: """ self.iteration += 1 loss = self.model(**kwargs) loss /= self.step_decay # backward if self.distributed and self.config['apex']['amp_used']: with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if self.iteration % self.step_decay == 0: self.optimizer.step() self.optimizer.zero_grad() if self.distributed: reduced_loss = reduce_tensor(loss.data, self.world_size) else: reduced_loss = loss.data return reduced_loss def step_no_grad(self, **kwargs): with torch.no_grad(): out = self.model(**kwargs) return out def before_epoch(self, epoch): self.iteration = 0 self.epoch = epoch self.model.train() self.synchronize() torch.cuda.empty_cache() self.lr_policy.step(epoch) def after_epoch(self, epoch): self.model.eval() self.synchronize() torch.cuda.empty_cache() def synchronize(self): synchronize() def save_checkpoint(self, path): if self.local_rank == 0: # logging.info("Saving checkpoint to file {}".format(path)) t_start = time.time() state_dict = {} from collections import OrderedDict new_state_dict = OrderedDict() for k, v in self.model.state_dict().items(): key = k if k.split('.')[0] == 'module': key = k[7:] new_state_dict[key] = v if 
self.config['apex']['amp_used']: state_dict['amp'] = amp.state_dict() state_dict['config'] = self.config state_dict['model'] = new_state_dict state_dict['optimizer'] = self.optimizer.state_dict() state_dict['lr_policy'] = self.lr_policy.state_dict() state_dict['epoch'] = self.epoch state_dict['iteration'] = self.iteration t_iobegin = time.time() torch.save(state_dict, path) del state_dict del new_state_dict t_end = time.time() logging.info( "Save checkpoint to file {}, " "Time usage:\n\tprepare snapshot: {}, IO: {}".format( path, t_iobegin - t_start, t_end - t_iobegin)) def save_images(self, filenames, image): raise NotImplementedError def copy_config(self, snapshot_dir, config_file): ensure_dir(snapshot_dir) assert osp.exists(config_file), "config file is not existed." new_file_name = osp.join(snapshot_dir, 'config.json') shutil.copy(config_file, new_file_name) def __enter__(self): return self def __exit__(self, type, value, tb): torch.cuda.empty_cache() if type is not None: logging.warning( "A exception occurred during Engine initialization, " "give up pspnet_ade process") return False
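`save_checkpoint` above strips the `module.` prefix inline before serializing; the same logic as a standalone helper is sketched below for reuse when loading DDP checkpoints into a bare model. It is not part of the original class.

# Hedged sketch: remove the 'module.' prefix that DistributedDataParallel adds
# to parameter names, so a checkpoint saved from a wrapped model loads cleanly.
from collections import OrderedDict

def strip_module_prefix(state_dict):
    cleaned = OrderedDict()
    for key, value in state_dict.items():
        cleaned[key[7:] if key.startswith('module.') else key] = value
    return cleaned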
def main(): #----------------------------------------------------------------------------------- sys.stdout = open('taco_k_log.txt', 'a') #----------------------------------------------------------------------------------- parser = argparse.ArgumentParser(description='PyTorch Tacotron 2 Training') parser = parse_args(parser) args, _ = parser.parse_known_args() if 'LOCAL_RANK' in os.environ and 'WORLD_SIZE' in os.environ: local_rank = int(os.environ['LOCAL_RANK']) world_size = int(os.environ['WORLD_SIZE']) else: local_rank = args.rank world_size = args.world_size distributed_run = world_size > 1 if local_rank == 0: DLLogger.init(backends=[ JSONStreamBackend(Verbosity.DEFAULT, args.output + '/' + args.log_file), StdOutBackend(Verbosity.VERBOSE) ]) else: DLLogger.init(backends=[]) for k, v in vars(args).items(): DLLogger.log(step="PARAMETER", data={k: v}) DLLogger.log(step="PARAMETER", data={'model_name': 'Tacotron2_PyT'}) model_name = args.model_name parser = models.parse_model_args(model_name, parser) args, _ = parser.parse_known_args() torch.backends.cudnn.enabled = args.cudnn_enabled torch.backends.cudnn.benchmark = args.cudnn_benchmark if distributed_run: init_distributed(args, world_size, local_rank, args.group_name) torch.cuda.synchronize() run_start_time = time.perf_counter() model_config = models.get_model_config(model_name, args) model = models.get_model(model_name, model_config, to_cuda=True, uniform_initialize_bn_weight=not args. disable_uniform_initialize_bn_weight) if not args.amp_run and distributed_run: model = DDP(model) optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) if args.amp_run: model, optimizer = amp.initialize(model, optimizer, opt_level="O1") if distributed_run: model = DDP(model) try: sigma = args.sigma except AttributeError: sigma = None start_epoch = [0] if args.checkpoint_path is not "": load_checkpoint(model, optimizer, start_epoch, model_config, args.amp_run, args.checkpoint_path) start_epoch = start_epoch[0] criterion = loss_functions.get_loss_function(model_name, sigma) try: n_frames_per_step = args.n_frames_per_step except AttributeError: n_frames_per_step = None collate_fn = data_functions.get_collate_function(model_name, n_frames_per_step) trainset = data_functions.get_data_loader(model_name, args.dataset_path, args.training_files, args) if distributed_run: train_sampler = DistributedSampler(trainset) shuffle = False else: train_sampler = None shuffle = True train_loader = DataLoader(trainset, num_workers=1, shuffle=shuffle, sampler=train_sampler, batch_size=args.batch_size, pin_memory=False, drop_last=True, collate_fn=collate_fn) valset = data_functions.get_data_loader(model_name, args.dataset_path, args.validation_files, args) batch_to_gpu = data_functions.get_batch_to_gpu(model_name) iteration = 0 train_epoch_items_per_sec = 0.0 val_loss = 0.0 num_iters = 0 model.train() for epoch in range(start_epoch, args.epochs): torch.cuda.synchronize() epoch_start_time = time.perf_counter() # used to calculate avg items/sec over epoch reduced_num_items_epoch = 0 # used to calculate avg loss over epoch train_epoch_avg_loss = 0.0 train_epoch_items_per_sec = 0.0 num_iters = 0 # if overflow at the last iteration then do not save checkpoint overflow = False if distributed_run: train_loader.sampler.set_epoch(epoch) for i, batch in enumerate(train_loader): torch.cuda.synchronize() iter_start_time = time.perf_counter() DLLogger.log(step=(epoch, i), data={ 'glob_iter/iters_per_epoch': str(iteration) + "/" + 
str(len(train_loader)) }) adjust_learning_rate(iteration, epoch, optimizer, args.learning_rate, args.anneal_steps, args.anneal_factor, local_rank) model.zero_grad() x, y, num_items = batch_to_gpu(batch) y_pred = model(x) loss = criterion(y_pred, y) if distributed_run: reduced_loss = reduce_tensor(loss.data, world_size).item() reduced_num_items = reduce_tensor(num_items.data, 1).item() else: reduced_loss = loss.item() reduced_num_items = num_items.item() if np.isnan(reduced_loss): raise Exception("loss is NaN") DLLogger.log(step=(epoch, i), data={'train_loss': reduced_loss}) train_epoch_avg_loss += reduced_loss num_iters += 1 # accumulate number of items processed in this epoch reduced_num_items_epoch += reduced_num_items if args.amp_run: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.grad_clip_thresh) else: loss.backward() grad_norm = torch.nn.utils.clip_grad_norm_( model.parameters(), args.grad_clip_thresh) optimizer.step() torch.cuda.synchronize() iter_stop_time = time.perf_counter() iter_time = iter_stop_time - iter_start_time items_per_sec = reduced_num_items / iter_time train_epoch_items_per_sec += items_per_sec #DLLogger.log(step=(epoch, i), data={'train_items_per_sec': items_per_sec}) #DLLogger.log(step=(epoch, i), data={'train_iter_time': iter_time}) iteration += 1 torch.cuda.synchronize() epoch_stop_time = time.perf_counter() epoch_time = epoch_stop_time - epoch_start_time #DLLogger.log(step=(epoch,), data={'train_items_per_sec': # (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)}) DLLogger.log(step=(epoch, ), data={ 'train_loss': (train_epoch_avg_loss / num_iters if num_iters > 0 else 0.0) }) #DLLogger.log(step=(epoch,), data={'train_epoch_time': epoch_time}) val_loss = validate(model, criterion, valset, epoch, i, args.batch_size, world_size, collate_fn, distributed_run, local_rank, batch_to_gpu) if (epoch % args.epochs_per_checkpoint == 0) and local_rank == 0 and args.bench_class == "": checkpoint_path = os.path.join( args.output, "checkpoint_{}_{}".format(model_name, epoch)) save_checkpoint(model, optimizer, epoch, model_config, args.amp_run, checkpoint_path) if local_rank == 0: DLLogger.flush() torch.cuda.synchronize() run_stop_time = time.perf_counter() run_time = run_stop_time - run_start_time #DLLogger.log(step=tuple(), data={'run_time': run_time}) DLLogger.log(step=tuple(), data={'val_loss': val_loss}) #DLLogger.log(step=tuple(), data={'train_items_per_sec': # (train_epoch_items_per_sec/num_iters if num_iters > 0 else 0.0)}) if local_rank == 0: DLLogger.flush()
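`adjust_learning_rate` is called above with `anneal_steps`/`anneal_factor` but defined elsewhere; the following is a hedged sketch of an anneal-step schedule with that signature, and the real helper may differ.

# Hedged sketch: decay the base learning rate by anneal_factor once for each
# anneal step the current epoch has passed; implementation is assumed.
def adjust_learning_rate(iteration, epoch, optimizer, learning_rate,
                         anneal_steps, anneal_factor, rank):
    p = 0
    if anneal_steps is not None:
        for a_step in anneal_steps:
            if epoch >= int(a_step):
                p += 1
    lr = learning_rate * (anneal_factor ** p)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr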
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

if args.distributed:
    # FOR DISTRIBUTED: After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)

torch.save(list(model.parameters()),
           "rank{}model.pth".format(torch.distributed.get_rank()))
torch.save(list(amp.master_params(optimizer)),
           "rank{}master.pth".format(torch.distributed.get_rank()))
model = torch.nn.Linear(D_in, D_out).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

model, optimizer = amp.initialize(model, optimizer, opt_level=args.opt_level)

if args.distributed:
    # FOR DISTRIBUTED: After amp.initialize, wrap the model with
    # apex.parallel.DistributedDataParallel.
    model = DistributedDataParallel(model)
    # torch.nn.parallel.DistributedDataParallel is also fine, with some added args:
    # model = torch.nn.parallel.DistributedDataParallel(model,
    #                                                   device_ids=[args.local_rank],
    #                                                   output_device=args.local_rank)

loss_fn = torch.nn.MSELoss()

for t in range(500):
    optimizer.zero_grad()
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()

if args.local_rank == 0:
    print("final loss = ", loss)

torch.save(list(model.parameters()),
           "rank{}model.pth".format(torch.distributed.get_rank()))
torch.save(list(amp.master_params(optimizer)),
           "rank{}master.pth".format(torch.distributed.get_rank()))
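Both amp snippets above rely on surrounding boilerplate (argument parsing, NCCL process-group init, and the toy tensors `x`/`y`, `D_in`/`D_out`) that is not part of this excerpt; the following setup is a sketch of what they assume, with all names chosen for illustration.

# Hedged sketch of the setup the two amp examples above depend on.
import os
import argparse
import torch
from apex import amp                                   # used by the snippets above
from apex.parallel import DistributedDataParallel

parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int, default=0)   # set by torch.distributed.launch
parser.add_argument("--opt_level", type=str, default="O2")
args = parser.parse_args()

args.distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1
if args.distributed:
    torch.cuda.set_device(args.local_rank)
    torch.distributed.init_process_group(backend="nccl", init_method="env://")

# toy regression data consumed by the training loops above
N, D_in, D_out = 64, 1024, 16
x = torch.randn(N, D_in, device="cuda")
y = torch.randn(N, D_out, device="cuda")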
model_single = Glow(1, args.n_flow, args.n_block,
                    affine=args.affine, conv_lu=not args.no_lu).cpu()
if len(args.load_path) > 0:
    model_single.load_state_dict(
        torch.load(args.load_path, map_location=lambda storage, loc: storage))
    model_single.initialize()
    gc.collect()
    torch.cuda.empty_cache()
    model_single = model_single.to(device)
else:
    model_single = model_single.to(device)

# Run two forward passes on one fixed batch before wrapping in DDP, so that
# data-dependent initialization settles and the per-rank log-likelihoods can
# be compared by eye.
first_batch = next(iter(sample_data(args.path, 8, args.img_size, rank=0)))
with torch.no_grad():
    print(args.local_rank, first_batch.mean(), first_batch.std())
    log_p, logdet = model_single(first_batch.to(device))
    print(args.local_rank, log_p)
    log_p, logdet = model_single(first_batch.to(device))
    print(args.local_rank, log_p)

dp_device_ids = [args.local_rank]
model = DDP(model_single, allreduce_always_fp32=True)
# , device_ids=dp_device_ids, output_device=args.local_rank)

optimizer = optim.Adam(model.parameters(), lr=args.lr)
# if len(args.load_path) > 0:
#     optim_path = '/'.join(args.load_path.split('/')[:-1])
#     optimizer.load_state_dict(torch.load(os.path.join(optim_path, 'optimizer.pth'),
#                                          map_location=lambda storage, loc: storage))
#     gc.collect()
#     torch.cuda.empty_cache()

train(args, model, optimizer)
def main(): global best_prec1, args args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() args.total_batch_size = args.world_size * args.batch_size if not os.path.isdir(args.checkpoint) and args.local_rank == 0: mkdir_p(args.checkpoint) if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if args.static_loss_scale != 1.0: if not args.fp16: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() model = model.cuda() if args.fp16: model = network_to_half(model) if args.distributed: # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf # for the older version of APEX please use shared_param, for newer one it is delay_allreduce model = DDP(model, delay_allreduce=True) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale, verbose=False) # optionally resume from a checkpoint title = 'ImageNet-' + args.arch if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) if args.local_rank == 0: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title, resume=True) else: print("=> no checkpoint found at '{}'".format(args.resume)) else: if args.local_rank == 0: logger = Logger(os.path.join(args.checkpoint, 'log.txt'), title=title) logger.set_names([ 'Learning Rate', 'Train Loss', 'Valid Loss', 'Train Acc.', 'Valid Acc.', 'Valid Top5.' ]) traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') if (args.arch == "inception_v3"): crop_size = 299 val_size = 320 # I chose this value arbitrarily, we can adjust. 
else: crop_size = 224 val_size = 256 pipe = HybridTrainPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=traindir, crop=crop_size, dali_cpu=args.dali_cpu) pipe.build() train_loader = DALIClassificationIterator( pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) pipe = HybridValPipe(batch_size=args.batch_size // 2, num_threads=args.workers // 2, device_id=args.local_rank, data_dir=valdir, crop=crop_size, size=val_size) pipe.build() val_loader = DALIClassificationIterator( pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) if args.evaluate: validate(val_loader, model, criterion) return total_time = AverageMeter() for epoch in range(args.start_epoch, args.epochs): # train for one epoch adjust_learning_rate(optimizer, epoch, args) if args.local_rank == 0: print('\nEpoch: [%d | %d] LR: %f' % (epoch + 1, args.epochs, optimizer.param_groups[0]['lr'])) [train_loss, train_acc, avg_train_time] = train(train_loader, model, criterion, optimizer, epoch) total_time.update(avg_train_time) # evaluate on validation set [test_loss, prec1, prec5] = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint if args.local_rank == 0: # append logger file logger.append([ optimizer.param_groups[0]['lr'], train_loss, test_loss, train_acc, prec1, prec5 ]) is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best, checkpoint=args.checkpoint) if epoch == args.epochs - 1: print('##Top-1 {0}\n' '##Top-5 {1}\n' '##Perf {2}'.format( prec1, prec5, args.total_batch_size / total_time.avg)) # reset DALI iterators train_loader.reset() val_loader.reset() if args.local_rank == 0: logger.close()
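`total_time = AverageMeter()` above uses a running-average helper defined elsewhere; the sketch below follows the usual PyTorch ImageNet-example meter and is an assumption about the actual class.

# Hedged sketch of the running-average helper used for epoch timing above.
class AverageMeter:
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0.0
        self.avg = 0.0
        self.sum = 0.0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count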
def main(): global best_prec1, args args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if args.static_loss_scale != 1.0: if not args.fp16: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() model = model.cuda() if args.fp16: model = network_to_half(model) if args.distributed: # shared param turns off bucketing in DDP, for lower latency runs this can improve perf model = DDP(model, shared_param=True) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() # Scale learning rate based on global batch size args.lr = args.lr * float(args.batch_size * args.world_size) / 256. optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') if (args.arch == "inception_v3"): crop_size = 299 val_size = 320 # I chose this value arbitrarily, we can adjust. 
else: crop_size = 224 val_size = 256 train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip(), # transforms.ToTensor(), Too slow # normalize, ])) val_dataset = datasets.ImageFolder( valdir, transforms.Compose([ transforms.Resize(val_size), transforms.CenterCrop(crop_size), ])) train_sampler = None val_sampler = None if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, collate_fn=fast_collate) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) if args.prof: break # evaluate on validation set prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint if args.local_rank == 0: is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best)
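Both loaders above pass `collate_fn=fast_collate`, which is not shown in this excerpt; the following is a sketch in the spirit of apex's fast collate, which keeps images as uint8 and defers normalization to the GPU. Details are assumptions.

# Hedged sketch of a fast_collate: stack PIL images into a uint8 NCHW tensor
# without converting or normalizing on the CPU.
import numpy as np
import torch

def fast_collate(batch):
    imgs = [sample[0] for sample in batch]
    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)
    w, h = imgs[0].size
    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
    for i, img in enumerate(imgs):
        arr = np.asarray(img, dtype=np.uint8)
        if arr.ndim < 3:
            arr = np.expand_dims(arr, axis=-1)          # grayscale -> HWC with one channel
        tensor[i] += torch.from_numpy(np.rollaxis(arr, 2))  # HWC -> CHW
    return tensor, targets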
def train(args, train_dataset, model): """ Train the model """ args.train_batch_size = args.per_gpu_train_batch_size // args.gradient_accumulation_steps train_sampler = RandomSampler( train_dataset) if args.local_rank == -1 else DistributedSampler( train_dataset) train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size) t_total = len(train_dataloader ) // args.gradient_accumulation_steps * args.num_train_epochs no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], 'weight_decay': args.weight_decay }, { 'params': [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], 'weight_decay': 0.0 }] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=t_total * args.warmup_proportion, t_total=t_total) if args.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) if args.fp16_opt_level == "O2": keep_batchnorm_fp32 = False else: keep_batchnorm_fp32 = True model, optimizer = amp.initialize( model, optimizer, opt_level=args.fp16_opt_level, keep_batchnorm_fp32=keep_batchnorm_fp32) # multi-gpu training (should be after apex fp16 initialization) if args.n_gpu > 1: model = torch.nn.DataParallel(model) # Distributed training (should be after apex fp16 initialization) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP( model, message_size=250000000, gradient_predivide_factor=torch.distributed.get_world_size()) # Train! logger.info("***** Running training *****") logger.info(" Num examples = %d", len(train_dataset)) logger.info(" Num Epochs = %d", args.num_train_epochs) logger.info(" Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size) logger.info( " Total train batch size (w. 
parallel, distributed & accumulation) = %d", args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)) logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) logger.info(" Total optimization steps = %d", t_total) global_step = 0 epochs = 0 model.zero_grad() model.train() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed( args) # Added here for reproductibility (even between python 2 and 3) for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Train(XX Epoch) Step(X/X) (loss=X.X)", disable=args.local_rank not in [-1, 0], leave=True, position=0) for step, batch in enumerate(epoch_iterator): batch = tuple(t.to(args.device) for t in batch) # multi-gpu does scattering it-self input_ids, input_mask, segment_ids, start_positions, end_positions = batch outputs = model(input_ids, segment_ids, input_mask, start_positions, end_positions) loss = outputs # model outputs are always tuple in transformers (see doc) if args.n_gpu > 1: loss = loss.mean( ) # mean() to average on multi-gpu parallel (not distributed) training if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() if (step + 1) % args.gradient_accumulation_steps == 0: if args.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(optimizer), args.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) scheduler.step() # Update learning rate schedule\ optimizer.step() optimizer.zero_grad() global_step += 1 epoch_iterator.set_description( "Train(%d Epoch) Step(%d / %d) (loss=%5.5f)" % (_, global_step, t_total, loss.item())) if args.local_rank in [-1, 0]: if epochs < 10: n_epochs = '0' + str(epochs) else: n_epochs = str(epochs) model_checkpoint = "korquad_{0}_{1}_{2}_{3}_{4}_{5}_{6}.bin".format( args.learning_rate, args.train_batch_size, n_epochs, int(args.num_train_epochs), args.eda_type, args.num_aug, args.alpha) logger.info(model_checkpoint) output_model_file = os.path.join(args.output_dir, model_checkpoint) if args.n_gpu > 1 or args.local_rank != -1: logger.info("** ** * Saving file * ** ** (module)") torch.save(model.module.state_dict(), output_model_file) else: logger.info("** ** * Saving file * ** **") torch.save(model.state_dict(), output_model_file) epochs += 1 logger.info("Training End!!!")
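`set_seed(args)` is called above for reproducibility but defined elsewhere; below is a sketch of the usual helper, an assumption about the exact implementation.

# Hedged sketch of the seeding helper called at the start of the training loop above.
import random
import numpy as np
import torch

def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)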
                  hidden_dim=hidden_dim,
                  num_layers=num_layers,
                  rnn_type=rnn_type,
                  bidirectional=bidirectional).to(device)
decoder = Decoder(hidden_dim=hidden_dim,
                  output_dim=output_dim,
                  num_layers=num_layers,
                  rnn_type=rnn_type,
                  bidirectional=bidirectional).to(device)
model = Seq2Seq(encoder, decoder).to(device)
model.apply(init_weights)  # weight initialization

# parallel
if args.distributed:
    model = DDP(model, delay_allreduce=True)  # multi gpu

optimizer = optim.SGD(model.parameters(), lr=lr) if args.opt == "sgd" else optim.Adam(
    model.parameters(), lr=lr)

if str(device) == 'cuda':
    criterion = nn.NLLLoss(ignore_index=target_field.vocab.stoi['<pad>']).cuda()
else:
    criterion = nn.NLLLoss(ignore_index=target_field.vocab.stoi['<pad>'])

# Training
if do_train:
    load_epoch = 0
    train_losses = []
    eval_scores = []
    best_eval_score = -float("inf")

    # Load Existing Model
    if args.resume:
def main(): global best_prec1, args args.distributed = args.world_size > 1 args.gpu = 0 if args.distributed: args.gpu = args.rank % torch.cuda.device_count() if args.distributed: torch.cuda.set_device(args.gpu) dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True, num_classes=args.num_classes) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](num_classes=args.num_classes) model = model.cuda() if args.fp16: model = network_to_half(model) if args.distributed: model = DDP(model) global model_params, master_params if args.fp16: model_params, master_params = prep_param_lists(model) else: master_params = list(model.parameters()) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(master_params, args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') pipe = HybridPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.rank, data_dir=traindir) pipe.build() test_run = pipe.run() from nvidia.dali.plugin.pytorch import DALIClassificationIterator train_loader = DALIClassificationIterator(pipe, size=int(1281167 / args.world_size)) pipe = HybridPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.rank, data_dir=valdir) pipe.build() test_run = pipe.run() from nvidia.dali.plugin.pytorch import DALIClassificationIterator val_loader = DALIClassificationIterator(pipe, size=int(50000 / args.world_size)) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) if args.prof: break # evaluate on validation set prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint if args.rank == 0: is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best)
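The fp16 path above builds `model_params`/`master_params` with `prep_param_lists`, but the actual update lives in `train()`, which is not shown; the following is a hedged sketch of the master-weight step such a loop typically performs with the apex.fp16_utils helpers, not the script's own code.

# Hedged sketch of one fp16 optimizer step over the prep_param_lists output;
# the real train() in this script may differ.
from apex.fp16_utils import model_grads_to_master_grads, master_params_to_model_params

def fp16_step(model, loss, optimizer, model_params, master_params, loss_scale=128.0):
    (loss * loss_scale).backward()                          # scaled backward in fp16
    model_grads_to_master_grads(model_params, master_params)
    for param in master_params:
        if param.grad is not None:
            param.grad.data.div_(loss_scale)                # unscale in fp32
    optimizer.step()                                        # update the fp32 master copy
    master_params_to_model_params(model_params, master_params)
    model.zero_grad()                                       # clear fp16 grads for the next step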
def main(): parser = ArgumentParser() parser.add_argument('--pregenerated_data', type=Path, required=True) parser.add_argument('--output_dir', type=Path, required=True) parser.add_argument( "--bert_model", type=str, required=True, help="Bert pre-trained model selected in the list: bert-base-uncased, " "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese." ) parser.add_argument("--do_lower_case", action="store_true") parser.add_argument( "--reduce_memory", action="store_true", help= "Store training data as on-disc memmaps to massively reduce memory usage" ) parser.add_argument("--epochs", type=int, default=3, help="Number of epochs to train for") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available") parser.add_argument( '--gradient_accumulation_steps', type=int, default=1, help= "Number of updates steps to accumulate before performing a backward/update pass." ) parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.") parser.add_argument( '--fp16', action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument( '--loss_scale', type=float, default=0, help= "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n" "0 (default value): dynamic loss scaling.\n" "Positive power of 2: static loss scaling value.\n") parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--learning_rate", default=3e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument("--logging_steps", default=10, type=int, help="Number of update steps between two logs.") args = parser.parse_args() assert args.pregenerated_data.is_dir(), \ "--pregenerated_data should point to the folder of files made by pregenerate_training_data.py!" samples_per_epoch = [] for i in range(args.epochs): epoch_file = args.pregenerated_data / f"epoch_{i}.json" metrics_file = args.pregenerated_data / f"epoch_{i}_metrics.json" if epoch_file.is_file() and metrics_file.is_file(): metrics = json.loads(metrics_file.read_text()) samples_per_epoch.append(metrics['num_training_examples']) else: if i == 0: exit("No training data was found!") print( f"Warning! There are fewer epochs of pregenerated data ({i}) than training epochs ({args.epochs})." ) print( "This script will loop over the available data, but training diversity may be negatively impacted." ) num_data_epochs = i break else: num_data_epochs = args.epochs if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = torch.cuda.device_count() else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = 1 # Initializes the distributed backend which will take care of sychronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logging.info( "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}". 
format(device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, should be >= 1" .format(args.gradient_accumulation_steps)) args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) if n_gpu > 0: torch.cuda.manual_seed_all(args.seed) if args.output_dir.is_dir() and list(args.output_dir.iterdir()): logging.warning( f"Output directory ({args.output_dir}) already exists and is not empty!" ) args.output_dir.mkdir(parents=True, exist_ok=True) tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) total_train_examples = 0 for i in range(args.epochs): # The modulo takes into account the fact that we may loop over limited epochs of data total_train_examples += samples_per_epoch[i % len(samples_per_epoch)] num_train_optimization_steps = int(total_train_examples / args.train_batch_size / args.gradient_accumulation_steps) if args.local_rank != -1: num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size( ) # Prepare model model = BertForPreTraining.from_pretrained(args.bert_model) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." ) model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.fp16: try: from apex.optimizers import FP16_Optimizer from apex.optimizers import FusedAdam except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training." 
) optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate, bias_correction=False, max_grad_norm=1.0) if args.loss_scale == 0: optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True) else: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale) else: optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps) global_step = 0 logging.info("***** Running training *****") logging.info(f" Num examples = {total_train_examples}") logging.info(" Batch size = %d", args.train_batch_size) logging.info(" Num steps = %d", num_train_optimization_steps) model.train() wandb.init(project="huggingface", config=vars(args)) wandb.watch(model, log_freq=max(100, args.logging_steps)) logging_loss = 0.0 for epoch in range(args.epochs): epoch_dataset = PregeneratedDataset( epoch=epoch, training_path=args.pregenerated_data, tokenizer=tokenizer, num_data_epochs=num_data_epochs, reduce_memory=args.reduce_memory) if args.local_rank == -1: train_sampler = RandomSampler(epoch_dataset) else: train_sampler = DistributedSampler(epoch_dataset) train_dataloader = DataLoader(epoch_dataset, sampler=train_sampler, batch_size=args.train_batch_size) tr_loss = 0 nb_tr_examples, nb_tr_steps = 0, 0 with tqdm(total=len(train_dataloader), desc=f"Epoch {epoch}") as pbar: for step, batch in enumerate(train_dataloader): logs = {} batch = tuple(t.to(device) for t in batch) input_ids, input_mask, segment_ids, lm_label_ids, is_next = batch outputs = model(input_ids, segment_ids, input_mask, lm_label_ids, is_next) loss = outputs[0] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps if args.fp16: optimizer.backward(loss) else: loss.backward() tr_loss += loss.item() nb_tr_examples += input_ids.size(0) nb_tr_steps += 1 pbar.update(1) mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps pbar.set_postfix_str(f"Loss: {mean_loss:.5f}") if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) scheduler.step() # Update learning rate schedule optimizer.step() optimizer.zero_grad() global_step += 1 if global_step % args.logging_steps == 0: wandb.log({"loss": mean_loss}, step=global_step) # Save a trained model if n_gpu > 1 and torch.distributed.get_rank() == 0 or n_gpu <= 1: logging.info("** ** * Saving fine-tuned model ** ** * ") model.save_pretrained(args.output_dir) tokenizer.save_pretrained(args.output_dir)
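`WarmupLinearSchedule` above comes from an older pytorch-transformers API; the same warmup-then-linear-decay behavior can be sketched with `torch.optim.lr_scheduler.LambdaLR` when that class is unavailable. This is a sketch, not the original scheduler.

# Hedged sketch: linear warmup for warmup_steps, then linear decay to zero.
from torch.optim.lr_scheduler import LambdaLR

def linear_warmup_decay(optimizer, warmup_steps, t_total):
    def lr_lambda(step):
        if step < warmup_steps:
            return float(step) / float(max(1, warmup_steps))
        return max(0.0, float(t_total - step) / float(max(1, t_total - warmup_steps)))
    return LambdaLR(optimizer, lr_lambda)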
class nnUNetTrainerV2_DDP(nnUNetTrainerV2): def __init__(self, plans_file, fold, local_rank, output_folder=None, dataset_directory=None, batch_dice=True, stage=None, unpack_data=True, deterministic=True, distribute_batch_size=False, fp16=False): super().__init__(plans_file, fold, output_folder, dataset_directory, batch_dice, stage, unpack_data, deterministic, fp16) self.init_args = (plans_file, fold, local_rank, output_folder, dataset_directory, batch_dice, stage, unpack_data, deterministic, distribute_batch_size, fp16) self.distribute_batch_size = distribute_batch_size np.random.seed(local_rank) torch.manual_seed(local_rank) torch.cuda.manual_seed_all(local_rank) self.local_rank = local_rank torch.cuda.set_device(local_rank) dist.init_process_group(backend='nccl', init_method='env://') self.val_loss_ma_alpha = 0.95 self.val_loss_MA = None self.loss = None self.ce_loss = CrossentropyND() self.global_batch_size = None # we need to know this to properly steer oversample def set_batch_size_and_oversample(self): batch_sizes = [] oversample_percents = [] world_size = dist.get_world_size() my_rank = dist.get_rank() if self.distribute_batch_size: self.global_batch_size = self.batch_size else: self.global_batch_size = self.batch_size * world_size batch_size_per_GPU = np.ceil(self.batch_size / world_size).astype(int) for rank in range(world_size): if self.distribute_batch_size: if (rank + 1) * batch_size_per_GPU > self.batch_size: batch_size = batch_size_per_GPU - ( (rank + 1) * batch_size_per_GPU - self.batch_size) else: batch_size = batch_size_per_GPU else: batch_size = self.batch_size batch_sizes.append(batch_size) sample_id_low = 0 if len(batch_sizes) == 0 else np.sum( batch_sizes[:-1]) sample_id_high = np.sum(batch_sizes) if sample_id_high / self.global_batch_size < ( 1 - self.oversample_foreground_percent): oversample_percents.append(0.0) elif sample_id_low / self.global_batch_size > ( 1 - self.oversample_foreground_percent): oversample_percents.append(1.0) else: percent_covered_by_this_rank = sample_id_high / self.global_batch_size - sample_id_low / self.global_batch_size oversample_percent_here = 1 - ( ((1 - self.oversample_foreground_percent) - sample_id_low / self.global_batch_size) / percent_covered_by_this_rank) oversample_percents.append(oversample_percent_here) print("worker", my_rank, "oversample", oversample_percents[my_rank]) print("worker", my_rank, "batch_size", batch_sizes[my_rank]) self.batch_size = batch_sizes[my_rank] self.oversample_foreground_percent = oversample_percents[my_rank] def save_checkpoint(self, fname, save_optimizer=True): if self.local_rank == 0: super().save_checkpoint(fname, save_optimizer) def plot_progress(self): if self.local_rank == 0: super().plot_progress() def print_to_log_file(self, *args, also_print_to_console=True): if self.local_rank == 0: super().print_to_log_file( *args, also_print_to_console=also_print_to_console) def initialize_network(self): """ This is specific to the U-Net and must be adapted for other network architectures :return: """ self.print_to_log_file(self.net_num_pool_op_kernel_sizes) self.print_to_log_file(self.net_conv_kernel_sizes) if self.threeD: conv_op = nn.Conv3d dropout_op = nn.Dropout3d norm_op = nn.InstanceNorm3d else: conv_op = nn.Conv2d dropout_op = nn.Dropout2d norm_op = nn.InstanceNorm2d norm_op_kwargs = {'eps': 1e-5, 'affine': True} dropout_op_kwargs = {'p': 0, 'inplace': True} net_nonlin = nn.LeakyReLU net_nonlin_kwargs = {'negative_slope': 1e-2, 'inplace': True} self.network = Generic_UNet( self.num_input_channels, 
self.base_num_features, self.num_classes, len(self.net_num_pool_op_kernel_sizes), self.conv_per_stage, 2, conv_op, norm_op, norm_op_kwargs, dropout_op, dropout_op_kwargs, net_nonlin, net_nonlin_kwargs, True, False, lambda x: x, InitWeights_He(1e-2), self.net_num_pool_op_kernel_sizes, self.net_conv_kernel_sizes, False, True, True) self.network.cuda() self.network.inference_apply_nonlin = softmax_helper def process_plans(self, plans): super().process_plans(plans) self.set_batch_size_and_oversample() def initialize(self, training=True, force_load_plans=False): """ For prediction of test cases just set training=False, this will prevent loading of training data and training batchgenerator initialization :param training: :return: """ if not self.was_initialized: maybe_mkdir_p(self.output_folder) if force_load_plans or (self.plans is None): self.load_plans_file() self.process_plans(self.plans) self.setup_DA_params() self.folder_with_preprocessed_data = join( self.dataset_directory, self.plans['data_identifier'] + "_stage%d" % self.stage) if training: self.dl_tr, self.dl_val = self.get_basic_generators() if self.unpack_data: if self.local_rank == 0: print("unpacking dataset") unpack_dataset(self.folder_with_preprocessed_data) print("done") else: # we need to wait until worker 0 has finished unpacking npz_files = subfiles( self.folder_with_preprocessed_data, suffix=".npz", join=False) case_ids = [i[:-4] for i in npz_files] all_present = all([ isfile( join(self.folder_with_preprocessed_data, i + ".npy")) for i in case_ids ]) while not all_present: print("worker", self.local_rank, "is waiting for unpacking") sleep(3) all_present = all([ isfile( join(self.folder_with_preprocessed_data, i + ".npy")) for i in case_ids ]) # there is some slight chance that there may arise some error because dataloader are loading a file # that is still being written by worker 0. We ignore this for now an address it only if it becomes # relevant # (this can occur because while worker 0 writes the file is technically present so the other workers # will proceed and eventually try to read it) else: print( "INFO: Not unpacking data! Training may be slow due to that. Pray you are not using 2d or you " "will wait all winter for your model to finish!") # setting weights for deep supervision losses net_numpool = len(self.net_num_pool_op_kernel_sizes) # we give each output a weight which decreases exponentially (division by 2) as the resolution decreases # this gives higher resolution outputs more weight in the loss weights = np.array([1 / (2**i) for i in range(net_numpool)]) # we don't use the lowest 2 outputs. 
Normalize weights so that they sum to 1 mask = np.array([ True if i < net_numpool - 1 else False for i in range(net_numpool) ]) weights[~mask] = 0 weights = weights / weights.sum() self.ds_loss_weights = weights seeds_train = np.random.random_integers( 0, 99999, self.data_aug_params.get('num_threads')) seeds_val = np.random.random_integers( 0, 99999, max(self.data_aug_params.get('num_threads') // 2, 1)) print("seeds train", seeds_train) print("seeds_val", seeds_val) self.tr_gen, self.val_gen = get_moreDA_augmentation( self.dl_tr, self.dl_val, self.data_aug_params['patch_size_for_spatialtransform'], self.data_aug_params, deep_supervision_scales=self.deep_supervision_scales, seeds_train=seeds_train, seeds_val=seeds_val) self.print_to_log_file("TRAINING KEYS:\n %s" % (str(self.dataset_tr.keys())), also_print_to_console=False) self.print_to_log_file("VALIDATION KEYS:\n %s" % (str(self.dataset_val.keys())), also_print_to_console=False) else: pass self.initialize_network() self.initialize_optimizer_and_scheduler() self._maybe_init_amp() self.network = DDP(self.network) else: self.print_to_log_file( 'self.was_initialized is True, not running self.initialize again' ) self.was_initialized = True def run_iteration(self, data_generator, do_backprop=True, run_online_evaluation=False): data_dict = next(data_generator) data = data_dict['data'] target = data_dict['target'] data = maybe_to_torch(data) target = maybe_to_torch(target) data = to_cuda(data, gpu_id=None) target = to_cuda(target, gpu_id=None) self.optimizer.zero_grad() output = self.network(data) del data total_loss = None for i in range(len(output)): # Starting here it gets spicy! axes = tuple(range(2, len(output[i].size()))) # network does not do softmax. We need to do softmax for dice output_softmax = softmax_helper(output[i]) # get the tp, fp and fn terms we need tp, fp, fn, _ = get_tp_fp_fn_tn(output_softmax, target[i], axes, mask=None) # for dice, compute nominator and denominator so that we have to accumulate only 2 instead of 3 variables # do_bg=False in nnUNetTrainer -> [:, 1:] nominator = 2 * tp[:, 1:] denominator = 2 * tp[:, 1:] + fp[:, 1:] + fn[:, 1:] if self.batch_dice: # for DDP we need to gather all nominator and denominator terms from all GPUS to do proper batch dice nominator = awesome_allgather_function.apply(nominator) denominator = awesome_allgather_function.apply(denominator) nominator = nominator.sum(0) denominator = denominator.sum(0) else: pass ce_loss = self.ce_loss(output[i], target[i]) # we smooth by 1e-5 to penalize false positives if tp is 0 dice_loss = (-(nominator + 1e-5) / (denominator + 1e-5)).mean() if total_loss is None: total_loss = self.ds_loss_weights[i] * (ce_loss + dice_loss) else: total_loss += self.ds_loss_weights[i] * (ce_loss + dice_loss) if run_online_evaluation: with torch.no_grad(): num_classes = output[0].shape[1] output_seg = output[0].argmax(1) target = target[0][:, 0] axes = tuple(range(1, len(target.shape))) tp_hard = torch.zeros( (target.shape[0], num_classes - 1)).to(output_seg.device.index) fp_hard = torch.zeros( (target.shape[0], num_classes - 1)).to(output_seg.device.index) fn_hard = torch.zeros( (target.shape[0], num_classes - 1)).to(output_seg.device.index) for c in range(1, num_classes): tp_hard[:, c - 1] = sum_tensor( (output_seg == c).float() * (target == c).float(), axes=axes) fp_hard[:, c - 1] = sum_tensor( (output_seg == c).float() * (target != c).float(), axes=axes) fn_hard[:, c - 1] = sum_tensor( (output_seg != c).float() * (target == c).float(), axes=axes) # tp_hard, fp_hard, 
fn_hard = get_tp_fp_fn((output_softmax > (1 / num_classes)).float(), target, # axes, None) # print_if_rank0("before allgather", tp_hard.shape) tp_hard = tp_hard.sum(0, keepdim=False)[None] fp_hard = fp_hard.sum(0, keepdim=False)[None] fn_hard = fn_hard.sum(0, keepdim=False)[None] tp_hard = awesome_allgather_function.apply(tp_hard) fp_hard = awesome_allgather_function.apply(fp_hard) fn_hard = awesome_allgather_function.apply(fn_hard) # print_if_rank0("after allgather", tp_hard.shape) # print_if_rank0("after sum", tp_hard.shape) self.run_online_evaluation( tp_hard.detach().cpu().numpy().sum(0), fp_hard.detach().cpu().numpy().sum(0), fn_hard.detach().cpu().numpy().sum(0)) del target if do_backprop: if not self.fp16 or amp is None: total_loss.backward() else: with amp.scale_loss(total_loss, self.optimizer) as scaled_loss: scaled_loss.backward() _ = clip_grad_norm_(self.network.parameters(), 12) self.optimizer.step() return total_loss.detach().cpu().numpy() def run_online_evaluation(self, tp, fp, fn): self.online_eval_foreground_dc.append( list((2 * tp) / (2 * tp + fp + fn + 1e-8))) self.online_eval_tp.append(list(tp)) self.online_eval_fp.append(list(fp)) self.online_eval_fn.append(list(fn)) def run_training(self): """ if we run with -c then we need to set the correct lr for the first epoch, otherwise it will run the first continued epoch with self.initial_lr we also need to make sure deep supervision in the network is enabled for training, thus the wrapper :return: """ self.maybe_update_lr( self.epoch ) # if we dont overwrite epoch then self.epoch+1 is used which is not what we # want at the start of the training if isinstance(self.network, DDP): net = self.network.module else: net = self.network ds = net.do_ds net.do_ds = True ret = nnUNetTrainer.run_training(self) net.do_ds = ds return ret def validate(self, do_mirroring: bool = True, use_train_mode: bool = False, tiled: bool = True, step: int = 2, save_softmax: bool = True, use_gaussian: bool = True, overwrite: bool = True, validation_folder_name: str = 'validation_raw', debug: bool = False, all_in_gpu: bool = False, force_separate_z: bool = None, interpolation_order: int = 3, interpolation_order_z=0): if self.local_rank == 0: if isinstance(self.network, DDP): net = self.network.module else: net = self.network ds = net.do_ds net.do_ds = False ret = nnUNetTrainer.validate( self, do_mirroring, use_train_mode, tiled, step, save_softmax, use_gaussian, overwrite, validation_folder_name, debug, all_in_gpu, force_separate_z=force_separate_z, interpolation_order=interpolation_order, interpolation_order_z=interpolation_order_z) net.do_ds = ds return ret def predict_preprocessed_data_return_softmax(self, data, do_mirroring, num_repeats, use_train_mode, batch_size, mirror_axes, tiled, tile_in_z, step, min_size, use_gaussian, all_in_gpu=False): """ Don't use this. If you need softmax output, use preprocess_predict_nifti and set softmax_output_file. 
:param data: :param do_mirroring: :param num_repeats: :param use_train_mode: :param batch_size: :param mirror_axes: :param tiled: :param tile_in_z: :param step: :param min_size: :param use_gaussian: :param use_temporal: :return: """ valid = list((SegmentationNetwork, nn.DataParallel, DDP)) assert isinstance(self.network, tuple(valid)) if isinstance(self.network, DDP): net = self.network.module else: net = self.network ds = net.do_ds net.do_ds = False ret = net.predict_3D(data, do_mirroring, num_repeats, use_train_mode, batch_size, mirror_axes, tiled, tile_in_z, step, min_size, use_gaussian=use_gaussian, pad_border_mode=self.inference_pad_border_mode, pad_kwargs=self.inference_pad_kwargs, all_in_gpu=all_in_gpu)[2] net.do_ds = ds return ret def load_checkpoint_ram(self, saved_model, train=True): """ used for if the checkpoint is already in ram :param saved_model: :param train: :return: """ if not self.was_initialized: self.initialize(train) new_state_dict = OrderedDict() curr_state_dict_keys = list(self.network.state_dict().keys()) # if state dict comes form nn.DataParallel but we use non-parallel model here then the state dict keys do not # match. Use heuristic to make it match for k, value in saved_model['state_dict'].items(): key = k if key not in curr_state_dict_keys: print("duh") key = key[7:] new_state_dict[key] = value # if we are fp16, then we need to reinitialize the network and the optimizer. Otherwise amp will throw an error if self.fp16: self.network, self.optimizer, self.lr_scheduler = None, None, None self.initialize_network() self.initialize_optimizer_and_scheduler() # we need to reinitialize DDP here self.network = DDP(self.network) self.network.load_state_dict(new_state_dict) self.epoch = saved_model['epoch'] if train: optimizer_state_dict = saved_model['optimizer_state_dict'] if optimizer_state_dict is not None: self.optimizer.load_state_dict(optimizer_state_dict) if self.lr_scheduler is not None and hasattr( self.lr_scheduler, 'load_state_dict' ) and saved_model['lr_scheduler_state_dict'] is not None: self.lr_scheduler.load_state_dict( saved_model['lr_scheduler_state_dict']) if issubclass(self.lr_scheduler.__class__, _LRScheduler): self.lr_scheduler.step(self.epoch) self.all_tr_losses, self.all_val_losses, self.all_val_losses_tr_mode, self.all_val_eval_metrics = saved_model[ 'plot_stuff'] # after the training is done, the epoch is incremented one more time in my old code. This results in # self.epoch = 1001 for old trained models when the epoch is actually 1000. This causes issues because # len(self.all_tr_losses) = 1000 and the plot function will fail. We can easily detect and correct that here if self.epoch != len(self.all_tr_losses): self.print_to_log_file( "WARNING in loading checkpoint: self.epoch != len(self.all_tr_losses). This is " "due to an old bug and should only appear when you are loading old models. New " "models should have this fixed! self.epoch is now set to len(self.all_tr_losses)" ) self.epoch = len(self.all_tr_losses) self.all_tr_losses = self.all_tr_losses[:self.epoch] self.all_val_losses = self.all_val_losses[:self.epoch] self.all_val_losses_tr_mode = self.all_val_losses_tr_mode[:self. epoch] self.all_val_eval_metrics = self.all_val_eval_metrics[:self.epoch] self.amp_initialized = False self._maybe_init_amp()
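# --- Added illustration (not part of the original trainer) -------------------
# A minimal, standalone sketch of the per-rank oversampling logic implemented in
# set_batch_size_and_oversample() above, written as a pure function so it can be
# checked without initializing torch.distributed. Sample slots are laid out rank
# after rank across the global batch, only the last fraction
# `oversample_foreground_percent` of those slots is foreground-oversampled, and
# each rank receives whatever part of that tail falls into its own slots.
def per_rank_oversample(global_batch_size, batch_size_per_rank, rank, oversample_foreground_percent):
    sample_id_low = rank * batch_size_per_rank
    sample_id_high = (rank + 1) * batch_size_per_rank
    cutoff = (1 - oversample_foreground_percent) * global_batch_size
    if sample_id_high <= cutoff:
        return 0.0  # all of this rank's samples lie before the oversampling cutoff
    if sample_id_low >= cutoff:
        return 1.0  # all of this rank's samples lie after the cutoff
    return (sample_id_high - cutoff) / batch_size_per_rank  # cutoff falls inside this rank's slice

# Hypothetical example: global batch 4 split over 2 ranks, 25% foreground oversampling
assert [per_rank_oversample(4, 2, r, 0.25) for r in range(2)] == [0.0, 0.5]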
def main(): global best_prec1, args args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() if args.sync_bn: import apex print("using apex synced BN") model = apex.parallel.convert_syncbn_model(model) model = model.cuda() if args.distributed: # By default, apex.parallel.DistributedDataParallel overlaps communication with # computation in the backward pass. # model = DDP(model) # delay_allreduce delays all communication to the end of the backward pass. model = DDP(model, delay_allreduce=True) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() # Scale learning rate based on global batch size args.lr = args.lr * float(args.batch_size * args.world_size) / 256. optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # Optionally resume from a checkpoint if args.resume: # Use a local scope to avoid dangling references def resume(): if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) resume() # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') if (args.arch == "inception_v3"): crop_size = 299 val_size = 320 # I chose this value arbitrarily, we can adjust. 
else: crop_size = 224 val_size = 256 train_dataset = datasets.ImageFolder( traindir, transforms.Compose([ transforms.RandomResizedCrop(crop_size), transforms.RandomHorizontalFlip(), # transforms.ToTensor(), Too slow # normalize, ])) val_dataset = datasets.ImageFolder( valdir, transforms.Compose([ transforms.Resize(val_size), transforms.CenterCrop(crop_size), ])) train_sampler = None val_sampler = None if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset) train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), num_workers=args.workers, pin_memory=True, sampler=train_sampler, collate_fn=fast_collate) val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.workers, pin_memory=True, sampler=val_sampler, collate_fn=fast_collate) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): if args.distributed: train_sampler.set_epoch(epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) if args.prof: break # evaluate on validation set prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint if args.local_rank == 0: is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best)
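# --- Added illustration (assumption: fast_collate follows the apex ImageNet example) ---
# The transforms above deliberately omit ToTensor()/Normalize; the dataloaders rely
# on a fast_collate helper that packs raw PIL images into a uint8 tensor, with
# normalization done later on the GPU. A sketch of what such a collate function
# typically looks like:
import numpy as np
import torch

def fast_collate(batch):
    imgs = [sample[0] for sample in batch]
    targets = torch.tensor([sample[1] for sample in batch], dtype=torch.int64)
    w, h = imgs[0].size
    tensor = torch.zeros((len(imgs), 3, h, w), dtype=torch.uint8)
    for i, img in enumerate(imgs):
        arr = np.asarray(img, dtype=np.uint8)
        if arr.ndim < 3:
            arr = np.expand_dims(arr, axis=-1)  # grayscale -> HxWx1
        arr = np.rollaxis(arr, 2)               # HWC -> CHW
        tensor[i] += torch.from_numpy(arr.copy())
    return tensor, targets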
def prepare_model_and_optimizer(args, device): # Prepare model config = modeling.BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training model = modeling.BertForPreTraining(config) if args.disable_weight_tying: import torch.nn as nn print ("WARNING!!!!!!! Disabling weight tying for this run") print ("BEFORE ", model.cls.predictions.decoder.weight is model.bert.embeddings.word_embeddings.weight) model.cls.predictions.decoder.weight = torch.nn.Parameter(model.cls.predictions.decoder.weight.clone().detach()) print ("AFTER ", model.cls.predictions.decoder.weight is model.bert.embeddings.word_embeddings.weight) assert (model.cls.predictions.decoder.weight is model.bert.embeddings.word_embeddings.weight) == False checkpoint = None if not args.resume_from_checkpoint: global_step = 0 else: if args.resume_step == -1 and not args.init_checkpoint: model_names = [f for f in os.listdir(args.output_dir) if f.endswith(".pt")] args.resume_step = max([int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names]) global_step = args.resume_step if not args.init_checkpoint else 0 if not args.init_checkpoint: checkpoint = torch.load(os.path.join(args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu") else: checkpoint = torch.load(args.init_checkpoint, map_location="cpu") model.load_state_dict(checkpoint['model'], strict=False) if args.phase2 and not args.init_checkpoint: global_step -= args.phase1_end_step if is_main_process(): print("resume step from ", args.resume_step) model.to(device) param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta', 'LayerNorm'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] optimizer = FusedAdam(optimizer_grouped_parameters, lr=args.learning_rate) lr_scheduler = PolyWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps, degree=1) if args.fp16: if args.loss_scale == 0: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic", cast_model_outputs=torch.float16) else: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale, cast_model_outputs=torch.float16) amp._amp_state.loss_scalers[0]._loss_scale = args.init_loss_scale model.checkpoint_activations(args.checkpoint_activations) if args.resume_from_checkpoint: if args.phase2 or args.init_checkpoint: keys = list(checkpoint['optimizer']['state'].keys()) #Override hyperparameters from previous checkpoint for key in keys: checkpoint['optimizer']['state'][key]['step'] = global_step for iter, item in enumerate(checkpoint['optimizer']['param_groups']): checkpoint['optimizer']['param_groups'][iter]['step'] = global_step checkpoint['optimizer']['param_groups'][iter]['t_total'] = args.max_steps checkpoint['optimizer']['param_groups'][iter]['warmup'] = args.warmup_proportion checkpoint['optimizer']['param_groups'][iter]['lr'] = args.learning_rate optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False) # Restore AMP master parameters if args.fp16: optimizer._lazy_init_maybe_master_weights() optimizer._amp_stash.lazy_init_called = True optimizer.load_state_dict(checkpoint['optimizer']) for param, saved_param in 
zip(amp.master_params(optimizer), checkpoint['master params']): param.data.copy_(saved_param.data) if args.local_rank != -1: if not args.allreduce_post_accumulation: model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size()) else: flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0,) ) elif args.n_gpu > 1: model = torch.nn.DataParallel(model) criterion = BertPretrainingCriterion(config.vocab_size) if args.disable_weight_tying: # Sanity Check that new param is in optimizer print ("SANITY CHECK OPTIMIZER: ", id(model.module.cls.predictions.decoder.weight) in [id(g) for g in optimizer.param_groups[0]['params']]) assert id(model.module.cls.predictions.decoder.weight) in [id(g) for g in optimizer.param_groups[0]['params']] return model, optimizer, lr_scheduler, checkpoint, global_step, criterion
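# --- Added illustration (not part of the original script) --------------------
# In the allreduce_post_accumulation branch above the model is NOT wrapped in DDP;
# flat_dist_call broadcasts the rank-0 parameters so that every worker starts from
# identical weights before gradients are all-reduced manually. A minimal
# plain-torch sketch of the same idea:
import torch.distributed as dist

def broadcast_initial_parameters(model, src_rank=0):
    # Overwrite every rank's parameters with the values held by `src_rank`.
    for p in model.parameters():
        dist.broadcast(p.data, src_rank)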
def main(): parser = argparse.ArgumentParser() parser.add_argument("--bert_model", default='bert-base-cased', type=str, help="Any model from transformers works, e.g. bert-base-uncased, roberta-base.") parser.add_argument("--output_dir", default='output', type=str, help="The output directory where the model checkpoints will be written.") parser.add_argument("--output_file", # default='output_batch4_gpu4_large_qo_lamda10_fp16.txt', default='output_file.txt', type=str, help="The output file where training and evaluation logs will be written.") parser.add_argument("--train_file", default='data/sem/ntrain.tsv', type=str) parser.add_argument("--test_file", default='data/sem/ntest.tsv', type=str) parser.add_argument("--dev_file", default='data/sem/ndev.tsv', type=str) parser.add_argument('--n_gpu', type=int, default=2, help='Number of GPUs to use.') parser.add_argument("--max_seq_length", default=512, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n" "Sequences longer than this will be truncated, and sequences shorter \n" "than this will be padded.") parser.add_argument("--train_batch_size", default=4, type=int, help="Total batch size for training.") parser.add_argument("--eval_batch_size", default=4, type=int, help="Total batch size for eval.") parser.add_argument("--learning_rate", default=5e-6, type=float, help="The initial learning rate for Adam.") parser.add_argument("--num_train_epochs", default=50.0, type=float, help="Total number of training epochs to perform.") parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of update steps to accumulate before performing a backward/update pass.") parser.add_argument("--do_train", default=False, action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", default=False, action='store_true', help="Whether to run eval on the dev set.") parser.add_argument("--do_lower_case", # needed when using an uncased (no-case) model default=True, action='store_true', help="Set this flag if you are using an uncased model.") parser.add_argument("--warmup_proportion", default=0.1, type=float, help="Proportion of training to perform linear learning rate warmup for. " "E.g., 0.1 = 10%% of training.") parser.add_argument("--no_cuda", default=False, action='store_true', help="Whether not to use CUDA when available") parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus") parser.add_argument('--seed', type=int, default=42, help="random seed for initialization") parser.add_argument('--optimize_on_cpu', default=False, action='store_true', help="Whether to perform optimization and keep the optimizer averages on CPU") parser.add_argument('--fp16', default=False, action='store_true', help="Whether to use 16-bit float precision instead of 32-bit") parser.add_argument('--loss_scale', type=float, default=4, # originally 4 help='Loss scaling, positive power of 2 values can improve fp16 convergence.') # additional arguments for the dev set parser.add_argument("--dev_batch_size", default=8, type=int, help="Total batch size for dev.") parser.add_argument("--print_step", default=50, type=int, help="Save the model and write log entries every this many steps.") parser.add_argument("--early_stop", type=int, default=50, help="Early stopping: stop training once dev accuracy has not improved for this many consecutive evaluations.") parser.add_argument("--label_list", default=["0", "1", "2", "3", "4"], type=list, help="Class label list (added by the author).") parser.add_argument("--predict_test_file", default='ntest_sg_label.tsv', type=str) parser.add_argument("--log_dir", default="log_dir", type=str, help="Log directory, mainly used for tensorboard analysis.") args = parser.parse_args() logger.info(args) output_eval_file = os.path.join(args.output_dir, args.output_file) os.makedirs(args.output_dir, exist_ok=True) os.makedirs(args.log_dir, exist_ok=True) # do not raise if the directory already exists with open(output_eval_file, "w") as writer: writer.write("%s\t\n" % args) if args.local_rank == -1 or args.no_cuda: device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") n_gpu = args.n_gpu else: torch.cuda.set_device(args.local_rank) device = torch.device("cuda", args.local_rank) n_gpu = args.n_gpu # Initializes the distributed backend which will take care of synchronizing nodes/GPUs torch.distributed.init_process_group(backend='nccl') logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format( device, n_gpu, bool(args.local_rank != -1), args.fp16)) if args.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( args.gradient_accumulation_steps)) args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps) # for reproducibility random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) # set the random seed on all GPUs torch.backends.cudnn.enabled = False torch.backends.cudnn.deterministic = True torch.backends.cudnn.benchmark = False os.environ['PYTHONHASHSEED'] = str(args.seed) # disable hash randomization so experiments are reproducible def seed_worker(worker_id): worker_seed = torch.initial_seed() % 2 ** 32 np.random.seed(worker_seed) random.seed(worker_seed) if not args.do_train and not args.do_eval: raise ValueError("At least one of `do_train` or `do_eval` must be True.") # read the data into dataframes df_train = pd.read_csv(args.train_file, sep='\t') df_dev = pd.read_csv(args.dev_file, sep='\t') df_test = pd.read_csv(args.test_file, sep='\t') # Load the pretrained Tokenizer tokenizer = AutoTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) model = AutoModelForSequenceClassification.from_pretrained(args.bert_model, num_labels=5, output_attentions=False, output_hidden_states=False) # tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case) 
# model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=5, # output_attentions=False, output_hidden_states=False) model.to(device) if args.fp16: model.half() model.to(device) if args.local_rank != -1: try: from apex.parallel import DistributedDataParallel as DDP except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.") model = DDP(model) elif n_gpu > 1: model = torch.nn.DataParallel(model) param_optimizer = list(model.named_parameters()) # hack to remove the pooler, which is not used; otherwise it produces None grads that break apex param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]] no_decay = ['bias', 'gamma', 'beta'] optimizer_grouped_parameters = [ {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.01}, {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay_rate': 0.0} ] def encode_fn(text_list): all_input_ids = [] for text in text_list: input_ids = tokenizer.encode(text, add_special_tokens=True, max_length=128, return_tensors='pt', pad_to_max_length=True) # this max_length may need to be adjusted! all_input_ids.append(input_ids) all_input_ids = torch.cat(all_input_ids, dim=0) return all_input_ids criterion = torch.nn.CrossEntropyLoss() # added the torch prefix criterion = criterion.to(device) if args.do_train: # Create the data loader train_text_values = df_train['sentence'].values all_input_ids = encode_fn(train_text_values) labels = df_train['label'].values labels = torch.tensor(labels - 1) # subtract one so labels start at 0 train_data = TensorDataset(all_input_ids, labels) train_dataloader = DataLoader(train_data, batch_size=args.train_batch_size, shuffle=True, worker_init_fn=seed_worker) # _init_fn dev_text_values = df_dev['sentence'].values dall_input_ids = encode_fn(dev_text_values) dlabels = df_dev['label'].values dlabels = torch.tensor(dlabels - 1) # subtract one so labels start at 0 dev_data = TensorDataset(dall_input_ids, dlabels) dev_dataloader = DataLoader(dev_data, batch_size=args.dev_batch_size, worker_init_fn=seed_worker) num_train_steps = int( len(df_train) / args.train_batch_size / args.gradient_accumulation_steps * args.num_train_epochs) # create optimizer and learning rate schedule optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, correct_bias=False) # set correct_bias=False to reproduce the original BertAdam behavior #total_steps = len(train_dataloader) * args.epoch scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(args.warmup_proportion*num_train_steps), num_training_steps=num_train_steps) # num_warmup_steps is a rough choice logger.info("***** Running training *****transformers") logger.info(" Num examples = %d", len(df_train)) logger.info(" Batch size = %d", args.train_batch_size) logger.info(" Num steps = %d", num_train_steps) logger.info("***** Running dev *****") logger.info(" Num examples = %d", len(df_dev)) logger.info(" Batch size = %d", args.dev_batch_size) with open(output_eval_file, "a") as writer: writer.write("\t\n***** Running training *****transformers\t\n") writer.write(" Num examples = %d\t\n" % len(df_train)) writer.write(" Batch size = %d\t\n" % args.train_batch_size) writer.write(" Num steps = %d\t\n" % num_train_steps) writer.write("\t\n***** Running dev *****transformers\t\n") writer.write(" Num examples = %d\t\n" % len(df_dev)) writer.write(" Batch size = %d\t\n" % args.dev_batch_size) global_step = 0 best_acc = 0 early_stop_times = 0 writer = SummaryWriter( log_dir=args.log_dir + '/' + time.strftime("%Y-%m-%d-%H:%M:%S", 
time.localtime(time.time()))) num_model = 0 num_bestacc = 0 for epoch in trange(int(args.num_train_epochs), desc="Epoch"): if early_stop_times >= args.early_stop: print('early_stop......') break print(f'---------------- Epoch: {epoch + 1:02} ----------') epoch_loss = 0 all_preds = np.array([], dtype=int) all_labels = np.array([], dtype=int) train_steps = 0 for step, batch in enumerate(tqdm(train_dataloader, ncols=50, desc="Iteration")): # ncols sets the progress-bar width; the default is 10 model.train() # correct placement: ensures every batch is processed in model.train() mode ## A conventional training loop computes the gradient and updates the network once per batch; here we use gradient accumulation instead. ## Gradient accumulation: fetch one batch at a time, compute its gradient without zeroing, and keep accumulating; after a fixed number of steps, update the parameters from the accumulated gradient, zero it, and continue. # Gradient accumulation step 1: forward pass to obtain the loss (feed text and labels, compute predictions, evaluate the loss) out1 = model(batch[0].to(device), token_type_ids=None, attention_mask=(batch[0] > 0).to(device), labels=batch[1].to(device)) loss, logits = out1[:2] if n_gpu > 1: loss = loss.mean() # mean() to average on multi-gpu. if args.fp16 and args.loss_scale != 1.0: # rescale loss for fp16 training # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html loss = loss * args.loss_scale # 2. loss.backward(): back-propagate to compute the current gradient; 2.1 loss regularization if args.gradient_accumulation_steps > 1: loss = loss / args.gradient_accumulation_steps train_steps += 1 # 2.2 back propagation if args.fp16: optimizer.backward(loss) else: loss.backward() ## back-propagate to compute gradients # data collected for plotting and analysis epoch_loss += loss.item() preds = logits.detach().cpu().numpy() outputs = np.argmax(preds, axis=1) all_preds = np.append(all_preds, outputs) label_ids = batch[1].to('cpu').numpy() all_labels = np.append(all_labels, label_ids) # 3. repeat steps 1-2 without zeroing so the gradients accumulate, then update the parameters of the net # after accumulating for the configured number of steps, call optimizer.step() to update the parameters from the accumulated gradient, then optimizer.zero_grad() to clear it for the next round if (step + 1) % args.gradient_accumulation_steps == 0: torch.nn.utils.clip_grad_norm_(model.parameters(), 1) # optimizer_grouped_parameters # gradient clipping is no longer built into AdamW; gradient norms above 1 are rescaled to 1.0 to prevent exploding gradients; only used during training, not at test time optimizer.step() ## update the weight parameters scheduler.step() optimizer.zero_grad() ## zero the gradients global_step += 1 # added: tune hyperparameters on the dev set if global_step % args.print_step == 0 and global_step != 0: num_model += 1 train_loss = epoch_loss / train_steps train_acc, train_report = classifiction_metric(all_preds, all_labels, args.label_list) dev_loss, dev_acc, dev_report, _, _, _ = evaluate(model, dev_dataloader, criterion, device, args.label_list) c = global_step // args.print_step writer.add_scalar("loss/train", train_loss, c) writer.add_scalar("loss/dev", dev_loss, c) writer.add_scalar("micro_f1/train", train_acc, c) ## acc/train writer.add_scalar("micro_f1/dev", dev_acc, c) ## acc/dev for label in args.label_list: writer.add_scalar(label + "_" + "f1/train", train_report[label]['f1-score'], c) writer.add_scalar(label + "_" + "f1/dev", dev_report[label]['f1-score'], c) print_list = ['macro', 'weighted'] for label in print_list: writer.add_scalar(label + "_avg_" + "f1/train", train_report[label + ' avg']['f1-score'], c) writer.add_scalar(label + "_avg_" + "f1/dev", dev_report[label + ' avg']['f1-score'], c) # keep the best model by dev accuracy if dev_acc > best_acc: num_bestacc += 1 best_acc = dev_acc # Save a trained model model_to_save = model.module if hasattr(model, 'module') else model # Only save the model itself output_model_file = os.path.join(args.output_dir, "_pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) early_stop_times = 0 else: early_stop_times += 1 with open(output_eval_file, "a") as writer: writer.write("\t\n***** Ending dev *****transformers\t\n") writer.write(" global_step : %d\t\n" % global_step) writer.write(" num_model : %d\t\n" % num_model) writer.write(" num_bestacc : %d\t\n" % num_bestacc) if args.do_eval: # save the labeled prediction file ntest_label.tsv as a dataframe with columns: id, text, label, predict_label df = pd.DataFrame(columns=['text', 'label', 'predict_label']) df['text'] = df_test['sentence'] # Create the test data loader test_text_values = df_test['sentence'].values tall_input_ids = encode_fn(test_text_values) tlabels = df_test['label'].values tlabels = torch.tensor(tlabels - 1) # subtract one so labels start at 0 pred_data = TensorDataset(tall_input_ids, tlabels) pred_dataloader = DataLoader(pred_data, batch_size=args.eval_batch_size, worker_init_fn=seed_worker) logger.info("***** Running evaluation *****transformers") logger.info(" Num examples = %d", len(df_test)) logger.info(" Batch size = %d", args.eval_batch_size) output_eval_file = os.path.join(args.output_dir, "result.txt") output_model_file = os.path.join(args.output_dir, "_pytorch_model.bin") model_state_dict = torch.load(output_model_file) model = AutoModelForSequenceClassification.from_pretrained(args.bert_model, num_labels=5, state_dict=model_state_dict, output_attentions=False, output_hidden_states=False) # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=5, state_dict=model_state_dict, # output_attentions=False, output_hidden_states=False) model.to(device) logger.info("Start evaluating") print("=======================") print("test_total...") _, eval_accuracy, eval_report, all_logits, all_preds, all_labels = evaluate(model, pred_dataloader, criterion, device, args.label_list) df['predict_label'] = all_preds df['label'] = all_labels ntest_sg_label = os.path.join(args.output_dir, args.predict_test_file) df.to_csv(ntest_sg_label, sep='\t') eval_macro_f1 = eval_report['macro avg']['f1-score'] result = {'eval_accuracy': eval_accuracy, 'eval_macro_f1': eval_macro_f1} with open(output_eval_file, "a") as writer: writer.write("***** Running evaluation *****transformers\t\n") writer.write(" Num examples = %d\t\n" % df.shape[0]) writer.write(" Batch size = %d\t\n" % args.eval_batch_size) logger.info("***** Eval results *****transformers") writer.write("\t\n***** Eval results %s *****transformers\t\n" % ( time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())))) for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\t" % (key, str(result[key]))) writer.write("\t\n") np.savetxt(args.output_dir + '/all_logits_transf.txt', all_logits.reshape(-1, 5))
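# --- Added illustration (not part of the original script) --------------------
# encode_fn above encodes one sentence at a time and relies on the deprecated
# `pad_to_max_length=True` flag. On recent transformers releases the same
# [num_texts, max_length] tensor of input ids can be produced with a single
# batched tokenizer call (a sketch, assuming a recent transformers version):
def encode_batch(tokenizer, text_list, max_length=128):
    enc = tokenizer(list(text_list),
                    add_special_tokens=True,
                    padding='max_length',
                    truncation=True,
                    max_length=max_length,
                    return_tensors='pt')
    return enc['input_ids']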
def main(): global best_prec1, args args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if args.static_loss_scale != 1.0: if not args.fp16: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch]() model = model.cuda() if args.fp16: model = network_to_half(model) if args.distributed: # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf # for the older version of APEX please use shared_param, for newer one it is delay_allreduce model = DDP(model, delay_allreduce=True) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load( args.resume, map_location=lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})".format( args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # Data loading code if len(args.data) == 1: traindir = os.path.join(args.data[0], 'train') valdir = os.path.join(args.data[0], 'val') else: traindir = args.data[0] valdir = args.data[1] if (args.arch == "inception_v3"): crop_size = 299 val_size = 320 # I chose this value arbitrarily, we can adjust. 
else: crop_size = 224 val_size = 256 pipe = HybridTrainPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=traindir, crop=crop_size, dali_cpu=args.dali_cpu) pipe.build() train_loader = DALIClassificationIterator( pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=valdir, crop=crop_size, size=val_size) pipe.build() val_loader = DALIClassificationIterator( pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): # train for one epoch train(train_loader, model, criterion, optimizer, epoch) if args.prof: break # evaluate on validation set [prec1, prec5] = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint if args.local_rank == 0: is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint( { 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer': optimizer.state_dict(), }, is_best) if epoch == args.epochs - 1: print('##Top-1 {0}\n' '##Top-5 {1}'.format(prec1, prec5)) # reset DALI iterators train_loader.reset() val_loader.reset()
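# --- Added illustration (assumption: standard DALIClassificationIterator output) ---
# Unlike a torch DataLoader, the DALI iterators built above yield a list with one
# dict per pipeline, and the decoded images already live on the GPU. The train()
# and validate() functions referenced above therefore typically unpack batches
# along these lines:
def iterate_dali(loader):
    for data in loader:
        images = data[0]["data"]                        # NCHW tensor, already on the GPU
        target = data[0]["label"].squeeze().long().cuda()
        yield images, target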
def prepare_model_and_optimizer(args, device): # Prepare model config = modeling.BertConfig.from_json_file(args.config_file) # Padding for divisibility by 8 if config.vocab_size % 8 != 0: config.vocab_size += 8 - (config.vocab_size % 8) modeling.ACT2FN["bias_gelu"] = modeling.bias_gelu_training model = modeling.BertForPreTraining(config) checkpoint = None if not args.resume_from_checkpoint: global_step = 0 else: if args.resume_step == -1 and not args.init_checkpoint: model_names = [ f for f in os.listdir(args.output_dir) if f.endswith(".pt") ] args.resume_step = max([ int(x.split('.pt')[0].split('_')[1].strip()) for x in model_names ]) global_step = args.resume_step if not args.init_checkpoint else 0 if not args.init_checkpoint: checkpoint = torch.load(os.path.join( args.output_dir, "ckpt_{}.pt".format(global_step)), map_location="cpu") else: checkpoint = torch.load(args.init_checkpoint, map_location="cpu") model.load_state_dict(checkpoint['model'], strict=False) if args.phase2 and not args.init_checkpoint: global_step -= args.phase1_end_step if is_main_process(): print("resume step from ", args.resume_step) model.to(device) # BERT modeling uses weight sharing between word embedding and prediction decoder. # So make sure the storage is pointing properly even after model is moved to device. if args.use_habana: model.cls.predictions.decoder.weight = model.bert.embeddings.word_embeddings.weight param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'gamma', 'beta', 'LayerNorm'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] if args.use_habana: if args.use_fused_lamb: try: from hb_custom import FusedLamb except ImportError: raise ImportError("Please install hbopt.") optimizer = FusedLamb(optimizer_grouped_parameters, lr=args.learning_rate) else: optimizer = NVLAMB(optimizer_grouped_parameters, lr=args.learning_rate) else: if torch.cuda.is_available(): optimizer = FusedLAMB(optimizer_grouped_parameters, lr=args.learning_rate) else: optimizer = NVLAMB(optimizer_grouped_parameters, lr=args.learning_rate) lr_scheduler = PolyWarmUpScheduler(optimizer, warmup=args.warmup_proportion, total_steps=args.max_steps) if args.fp16: if args.loss_scale == 0: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic", cast_model_outputs=torch.float16) else: model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale=args.loss_scale, cast_model_outputs=torch.float16) amp._amp_state.loss_scalers[0]._loss_scale = args.init_loss_scale model.checkpoint_activations(args.checkpoint_activations) if args.resume_from_checkpoint: if args.phase2 or args.init_checkpoint: keys = list(checkpoint['optimizer']['state'].keys()) #Override hyperparameters from previous checkpoint for key in keys: checkpoint['optimizer']['state'][key]['step'] = global_step for iter, item in enumerate( checkpoint['optimizer']['param_groups']): checkpoint['optimizer']['param_groups'][iter][ 'step'] = global_step checkpoint['optimizer']['param_groups'][iter][ 't_total'] = args.max_steps checkpoint['optimizer']['param_groups'][iter][ 'warmup'] = args.warmup_proportion checkpoint['optimizer']['param_groups'][iter][ 'lr'] = args.learning_rate optimizer.load_state_dict(checkpoint['optimizer']) # , strict=False) # Restore AMP master parameters if args.fp16: optimizer._lazy_init_maybe_master_weights() 
optimizer._amp_stash.lazy_init_called = True optimizer.load_state_dict(checkpoint['optimizer']) for param, saved_param in zip(amp.master_params(optimizer), checkpoint['master params']): param.data.copy_(saved_param.data) if args.local_rank != -1: if not args.allreduce_post_accumulation: if not args.use_jit_trace: if args.use_habana: model = DDP(model) else: model = DDP(model, message_size=250000000, gradient_predivide_factor=get_world_size()) else: flat_dist_call([param.data for param in model.parameters()], torch.distributed.broadcast, (0, )) elif args.n_pu > 1: model = torch.nn.DataParallel(model) criterion = BertPretrainingCriterion(config.vocab_size) return model, optimizer, lr_scheduler, checkpoint, global_step, criterion
def main(config): save_path = config['save_path'] epochs = config['epochs'] os.environ['TORCH_HOME'] = config['torch_home'] distributed = config['use_DDP'] start_ep = 0 start_cnt = 0 # initialize model print("Initializing model...") if distributed: initialize_distributed(config) rank = config['rank'] # map string name to class constructor model = get_model(config) model.apply(init_weights) if config['resume_ckpt'] is not None: # load weights from checkpoint state_dict = load_weights(config['resume_ckpt']) model.load_state_dict(state_dict) print("Moving model to GPU") model.cuda(torch.cuda.current_device()) print("Setting up losses") if config['use_vgg']: criterionVGG = Vgg19PerceptualLoss(config['reduced_w']) criterionVGG.cuda() validationLoss = criterionVGG if config['use_gan']: use_sigmoid = config['no_lsgan'] disc_input_channels = 3 discriminator = MultiscaleDiscriminator(disc_input_channels, config['ndf'], config['n_layers_D'], 'instance', use_sigmoid, config['num_D'], False, False) discriminator.apply(init_weights) if config['resume_ckpt_D'] is not None: # load weights from checkpoint print("Resuming discriminator from %s" % (config['resume_ckpt_D'])) state_dict = load_weights(config['resume_ckpt_D']) discriminator.load_state_dict(state_dict) discriminator.cuda(torch.cuda.current_device()) criterionGAN = GANLoss(use_lsgan=not config['no_lsgan']) criterionGAN.cuda() criterionFeat = nn.L1Loss().cuda() if config['use_l2']: criterionMSE = nn.MSELoss() criterionMSE.cuda() validationLoss = criterionMSE # initialize dataloader print("Setting up dataloaders...") train_dataloader, val_dataloader, train_sampler = setup_dataloaders(config) print("Done!") # run the training loop print("Initializing optimizers...") optimizer_G = optim.Adam(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay']) if config['resume_ckpt_opt_G'] is not None: optimizer_G_state_dict = torch.load( config['resume_ckpt_opt_G'], map_location=lambda storage, loc: storage) optimizer_G.load_state_dict(optimizer_G_state_dict) if config['use_gan']: optimizer_D = optim.Adam(discriminator.parameters(), lr=config['learning_rate']) if config['resume_ckpt_opt_D'] is not None: optimizer_D_state_dict = torch.load( config['resume_ckpt_opt_D'], map_location=lambda storage, loc: storage) optimizer_D.load_state_dict(optimizer_D_state_dict) print("Done!") if distributed: print("Moving model to DDP...") model = DDP(model) if config['use_gan']: discriminator = DDP(discriminator, delay_allreduce=True) print("Done!") tb_logger = None if rank == 0: tb_logdir = os.path.join(save_path, 'tbdir') if not os.path.exists(tb_logdir): os.makedirs(tb_logdir) tb_logger = SummaryWriter(tb_logdir) # run training if not os.path.exists(save_path): os.makedirs(save_path) log_name = os.path.join(save_path, 'loss_log.txt') opt_name = os.path.join(save_path, 'opt.yaml') print(config) save_options(opt_name, config) log_handle = open(log_name, 'a') print("Starting training") cnt = start_cnt assert (config['use_warped'] or config['use_temporal']) for ep in range(start_ep, epochs): if train_sampler is not None: train_sampler.set_epoch(ep) for curr_batch in train_dataloader: optimizer_G.zero_grad() input_a = curr_batch['input_a'].cuda() target = curr_batch['target'].cuda() if config['use_warped'] and config['use_temporal']: input_a = torch.cat((input_a, input_a), 0) input_b = torch.cat((curr_batch['input_b'].cuda(), curr_batch['input_temporal'].cuda()), 0) target = torch.cat((target, target), 0) elif config['use_temporal']: input_b = 
curr_batch['input_temporal'].cuda() elif config['use_warped']: input_b = curr_batch['input_b'].cuda() output_dict = model(input_a, input_b) output_recon = output_dict['reconstruction'] loss_vgg = loss_G_GAN = loss_G_feat = loss_l2 = 0 if config['use_vgg']: loss_vgg = criterionVGG(output_recon, target) * config['vgg_lambda'] if config['use_gan']: predicted_landmarks = output_dict['input_a_gauss_maps'] # output_dict['reconstruction'] can be considered normalized loss_G_GAN, loss_D_real, loss_D_fake = apply_GAN_criterion( output_recon, target, predicted_landmarks.detach(), discriminator, criterionGAN) loss_D = (loss_D_fake + loss_D_real) * 0.5 if config['use_l2']: loss_l2 = criterionMSE(output_recon, target) * config['l2_lambda'] loss_G = loss_G_GAN + loss_G_feat + loss_vgg + loss_l2 loss_G.backward() # grad_norm clipping if not config['no_grad_clip']: torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer_G.step() if config['use_gan']: optimizer_D.zero_grad() loss_D.backward() # grad_norm clipping if not config['no_grad_clip']: torch.nn.utils.clip_grad_norm_(discriminator.parameters(), 1.0) optimizer_D.step() if distributed: if config['use_vgg']: loss_vgg = reduce_tensor(loss_vgg, config['world_size']) if rank == 0: if cnt % 10 == 0: run_visualization(output_dict, output_recon, target, input_a, input_b, save_path, tb_logger, cnt) print_dict = {"learning_rate": get_learning_rate(optimizer_G)} if config['use_vgg']: tb_logger.add_scalar('vgg.loss', loss_vgg, cnt) print_dict['Loss_VGG'] = loss_vgg.data if config['use_gan']: tb_logger.add_scalar('gan.loss', loss_G_GAN, cnt) tb_logger.add_scalar('d_real.loss', loss_D_real, cnt) tb_logger.add_scalar('d_fake.loss', loss_D_fake, cnt) print_dict['Loss_G_GAN'] = loss_G_GAN print_dict['Loss_real'] = loss_D_real.data print_dict['Loss_fake'] = loss_D_fake.data if config['use_l2']: tb_logger.add_scalar('l2.loss', loss_l2, cnt) print_dict['Loss_L2'] = loss_l2.data log_iter(ep, cnt % len(train_dataloader), len(train_dataloader), print_dict, log_handle=log_handle) if loss_G != loss_G: print("NaN!!") exit(-2) cnt = cnt + 1 # end of train iter loop if cnt % config['val_freq'] == 0 and config['val_freq'] > 0: val_loss = run_val( model, validationLoss, val_dataloader, os.path.join(save_path, 'val_%d_renders' % (ep))) if distributed: val_loss = reduce_tensor(val_loss, config['world_size']) if rank == 0: tb_logger.add_scalar('validation.loss', val_loss, cnt) log_iter(ep, cnt % len(train_dataloader), len(train_dataloader), {"Loss_VGG": val_loss}, header="Validation loss: ", log_handle=log_handle) if rank == 0: if (ep % config['save_freq'] == 0): fname = 'checkpoint_%d.ckpt' % (ep) fname = os.path.join(save_path, fname) print("Saving model...") save_weights(model, fname, distributed) optimizer_g_fname = os.path.join( save_path, 'latest_optimizer_g_state.ckpt') torch.save(optimizer_G.state_dict(), optimizer_g_fname) if config['use_gan']: fname = 'checkpoint_D_%d.ckpt' % (ep) fname = os.path.join(save_path, fname) save_weights(discriminator, fname, distributed) optimizer_d_fname = os.path.join( save_path, 'latest_optimizer_d_state.ckpt') torch.save(optimizer_D.state_dict(), optimizer_d_fname)
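# --- Added illustration (not part of the original script) --------------------
# reduce_tensor() above averages a loss value across ranks so that rank 0 logs the
# global mean rather than its local value. A minimal version of such a helper:
import torch.distributed as dist

def reduce_tensor(tensor, world_size):
    rt = tensor.clone()
    dist.all_reduce(rt, op=dist.ReduceOp.SUM)  # sum the local values from all ranks
    return rt / world_size                     # then divide to obtain the mean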
class Distiller: def __init__(self, params: dict, dataset: LmSeqsDataset, token_probs: torch.tensor, student: nn.Module, teacher: nn.Module): logger.info('Initializing Distiller') self.params = params self.dump_path = params.dump_path self.multi_gpu = params.multi_gpu self.fp16 = params.fp16 self.student = student self.teacher = teacher self.student_config = student.config self.vocab_size = student.config.vocab_size if params.n_gpu <= 1: sampler = RandomSampler(dataset) else: sampler = DistributedSampler(dataset) if params.group_by_size: groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size) sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size) else: sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False) self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences) self.temperature = params.temperature assert self.temperature > 0. self.alpha_ce = params.alpha_ce self.alpha_mlm = params.alpha_mlm self.alpha_clm = params.alpha_clm self.alpha_mse = params.alpha_mse self.alpha_cos = params.alpha_cos self.mlm = params.mlm if self.mlm: logger.info(f'Using MLM loss for LM step.') self.mlm_mask_prop = params.mlm_mask_prop assert 0.0 <= self.mlm_mask_prop <= 1.0 assert params.word_mask + params.word_keep + params.word_rand == 1.0 self.pred_probs = torch.FloatTensor( [params.word_mask, params.word_keep, params.word_rand]) self.pred_probs = self.pred_probs.to( f'cuda:{params.local_rank}' ) if params.n_gpu > 0 else self.pred_probs self.token_probs = token_probs.to( f'cuda:{params.local_rank}' ) if params.n_gpu > 0 else token_probs if self.fp16: self.pred_probs = self.pred_probs.half() self.token_probs = self.token_probs.half() else: logger.info(f'Using CLM loss for LM step.') self.epoch = 0 self.n_iter = 0 self.n_total_iter = 0 self.n_sequences_epoch = 0 self.total_loss_epoch = 0 self.last_loss = 0 self.last_loss_ce = 0 self.last_loss_mlm = 0 self.last_loss_clm = 0 if self.alpha_mse > 0.: self.last_loss_mse = 0 if self.alpha_cos > 0.: self.last_loss_cos = 0 self.last_log = 0 self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean') self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1) if self.alpha_mse > 0.: self.mse_loss_fct = nn.MSELoss(reduction='sum') if self.alpha_cos > 0.: self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction='mean') logger.info('--- Initializing model optimizer') assert params.gradient_accumulation_steps >= 1 self.num_steps_epoch = len(self.dataloader) num_train_optimization_steps = int( self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1 no_decay = ['bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad ], 'weight_decay': params.weight_decay }, { 'params': [ p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad ], 'weight_decay': 0.0 }] logger.info( "------ Number of trainable parameters (student): %i" % sum([ p.numel() for p in self.student.parameters() if p.requires_grad ])) logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()])) self.optimizer = AdamW(optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)) warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop) self.scheduler = get_linear_schedule_with_warmup( self.optimizer, 
num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps) if self.fp16: try: from apex import amp except ImportError: raise ImportError( "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." ) logger.info( f"Using fp16 training: {self.params.fp16_opt_level} level") self.student, self.optimizer = amp.initialize( self.student, self.optimizer, opt_level=self.params.fp16_opt_level) self.teacher = self.teacher.half() if self.multi_gpu: if self.fp16: from apex.parallel import DistributedDataParallel logger.info( "Using apex.parallel.DistributedDataParallel for distributed training." ) self.student = DistributedDataParallel(self.student) else: from torch.nn.parallel import DistributedDataParallel logger.info( "Using nn.parallel.DistributedDataParallel for distributed training." ) self.student = DistributedDataParallel( self.student, device_ids=[params.local_rank], output_device=params.local_rank, find_unused_parameters=True) self.is_master = params.is_master if self.is_master: logger.info('--- Initializing Tensorboard') self.tensorboard = SummaryWriter( log_dir=os.path.join(self.dump_path, 'log', 'train')) self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0) self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0) def prepare_batch_mlm(self, batch): """ Prepare the batch: from the token_ids and the lengths, compute the attention mask and the masked labels for MLM. Input: ------ batch: `Tuple` token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequences. They are padded. lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch. Output: ------- token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM. attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention. mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels. There is a -1 where there is nothing to predict.
""" token_ids, lengths = batch token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) assert token_ids.size(0) == lengths.size(0) attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]) bs, max_seq_len = token_ids.size() mlm_labels = token_ids.new(token_ids.size()).copy_(token_ids) x_prob = self.token_probs[token_ids.flatten()] n_tgt = math.ceil(self.mlm_mask_prop * lengths.sum().item()) tgt_ids = torch.multinomial(x_prob / x_prob.sum(), n_tgt, replacement=False) pred_mask = torch.zeros( bs * max_seq_len, dtype=torch.bool, device=token_ids.device ) # previously `dtype=torch.uint8`, cf pytorch 1.2.0 compatibility pred_mask[tgt_ids] = 1 pred_mask = pred_mask.view(bs, max_seq_len) pred_mask[token_ids == self.params.special_tok_ids['pad_token']] = 0 # mask a number of words == 0 [8] (faster with fp16) if self.fp16: n1 = pred_mask.sum().item() if n1 > 8: pred_mask = pred_mask.view(-1) n2 = max(n1 % 8, 8 * (n1 // 8)) if n2 != n1: pred_mask[torch.nonzero(pred_mask).view(-1)[:n1 - n2]] = 0 pred_mask = pred_mask.view(bs, max_seq_len) assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item() _token_ids_real = token_ids[pred_mask] _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size) _token_ids_mask = _token_ids_real.clone().fill_( self.params.special_tok_ids['mask_token']) probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True) _token_ids = _token_ids_mask * ( probs == 0).long() + _token_ids_real * ( probs == 1).long() + _token_ids_rand * (probs == 2).long() token_ids = token_ids.masked_scatter(pred_mask, _token_ids) mlm_labels[ ~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size return token_ids, attn_mask, mlm_labels def prepare_batch_clm(self, batch): """ Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the labels for CLM. Input: ------ batch: `Tuple` token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded. lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch. Output: ------- token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM. attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention. clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict. """ token_ids, lengths = batch token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths) assert token_ids.size(0) == lengths.size(0) attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]) clm_labels = token_ids.new(token_ids.size()).copy_(token_ids) clm_labels[ ~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility # sanity checks assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size return token_ids, attn_mask, clm_labels def round_batch(self, x: torch.tensor, lengths: torch.tensor): """ For float16 only. Sub-sample sentences in a batch, and add padding, so that each dimension is a multiple of 8. Input: ------ x: `torch.tensor(bs, seq_length)` - The token ids. lengths: `torch.tensor(bs, seq_length)` - The lengths of each of the sequence in the batch. Output: ------- x: `torch.tensor(new_bs, new_seq_length)` - The updated token ids. 
lengths: `torch.tensor(new_bs)` - The updated lengths. """ if not self.fp16 or len(lengths) < 8: return x, lengths # make the number of sentences a multiple of 8 bs1 = len(lengths) bs2 = 8 * (bs1 // 8) assert bs2 > 0 and bs2 % 8 == 0 if bs1 != bs2: idx = torch.randperm(bs1)[:bs2] lengths = lengths[idx] slen = lengths.max().item() x = x[idx, :slen] else: idx = None # make the sequence length a multiple of 8 ml1 = x.size(1) if ml1 % 8 != 0: pad = 8 - (ml1 % 8) ml2 = ml1 + pad if self.mlm: pad_id = self.params.special_tok_ids['pad_token'] else: pad_id = self.params.special_tok_ids['unk_token'] padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id) x = torch.cat([x, padding_tensor], 1) assert x.size() == (bs2, ml2) assert x.size(0) % 8 == 0 assert x.size(1) % 8 == 0 return x, lengths def train(self): """ The real training loop. """ if self.is_master: logger.info('Starting training') self.last_log = time.time() self.student.train() self.teacher.eval() for _ in range(self.params.n_epoch): if self.is_master: logger.info( f'--- Starting epoch {self.epoch}/{self.params.n_epoch-1}') if self.multi_gpu: torch.distributed.barrier() iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0]) for batch in iter_bar: if self.params.n_gpu > 0: batch = tuple( t.to(f'cuda:{self.params.local_rank}') for t in batch) if self.mlm: token_ids, attn_mask, lm_labels = self.prepare_batch_mlm( batch=batch) else: token_ids, attn_mask, lm_labels = self.prepare_batch_clm( batch=batch) self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels) iter_bar.update() iter_bar.set_postfix({ 'Last_loss': f'{self.last_loss:.2f}', 'Avg_cum_loss': f'{self.total_loss_epoch/self.n_iter:.2f}' }) iter_bar.close() if self.is_master: logger.info( f'--- Ending epoch {self.epoch}/{self.params.n_epoch-1}') self.end_epoch() if self.is_master: logger.info(f'Save very last checkpoint as `pytorch_model.bin`.') self.save_checkpoint(checkpoint_name=f'pytorch_model.bin') logger.info('Training is finished') def step(self, input_ids: torch.tensor, attention_mask: torch.tensor, lm_labels: torch.tensor): """ One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation), and possibly a parameter update (depending on the gradient accumulation). Input: ------ input_ids: `torch.tensor(bs, seq_length)` - The token ids. attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self-attention. lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
""" if self.mlm: s_logits, s_hidden_states = self.student( input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size) with torch.no_grad(): t_logits, t_hidden_states = self.teacher( input_ids=input_ids, attention_mask=attention_mask ) # (bs, seq_length, voc_size) else: s_logits, _, s_hidden_states = self.student( input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size) with torch.no_grad(): t_logits, _, t_hidden_states = self.teacher( input_ids=input_ids, attention_mask=None) # (bs, seq_length, voc_size) assert s_logits.size() == t_logits.size() #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100 #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2 if self.params.restrict_ce_to_mask: mask = (lm_labels > -1).unsqueeze(-1).expand_as( s_logits) # (bs, seq_lenth, voc_size) else: mask = attention_mask.unsqueeze(-1).expand_as( s_logits) # (bs, seq_lenth, voc_size) s_logits_slct = torch.masked_select( s_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask s_logits_slct = s_logits_slct.view(-1, s_logits.size( -1)) # (bs * seq_length, voc_size) modulo the 1s in mask t_logits_slct = torch.masked_select( t_logits, mask) # (bs * seq_length * voc_size) modulo the 1s in mask t_logits_slct = t_logits_slct.view(-1, s_logits.size( -1)) # (bs * seq_length, voc_size) modulo the 1s in mask assert t_logits_slct.size() == s_logits_slct.size() loss_ce = self.ce_loss_fct( F.log_softmax(s_logits_slct / self.temperature, dim=-1), F.softmax(t_logits_slct / self.temperature, dim=-1)) * (self.temperature)**2 loss = self.alpha_ce * loss_ce if self.alpha_mlm > 0.: loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1)) loss += self.alpha_mlm * loss_mlm if self.alpha_clm > 0.: shift_logits = s_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_clm = self.lm_loss_fct( shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) loss += self.alpha_clm * loss_clm if self.alpha_mse > 0.: loss_mse = self.mse_loss_fct( s_logits_slct, t_logits_slct) / s_logits_slct.size( 0) # Reproducing batchmean reduction loss += self.alpha_mse * loss_mse if self.alpha_cos > 0.: s_hidden_states = s_hidden_states[-1] # (bs, seq_length, dim) t_hidden_states = t_hidden_states[-1] # (bs, seq_length, dim) mask = attention_mask.unsqueeze(-1).expand_as( s_hidden_states) # (bs, seq_length, dim) assert s_hidden_states.size() == t_hidden_states.size() dim = s_hidden_states.size(-1) s_hidden_states_slct = torch.masked_select( s_hidden_states, mask) # (bs * seq_length * dim) s_hidden_states_slct = s_hidden_states_slct.view( -1, dim) # (bs * seq_length, dim) t_hidden_states_slct = torch.masked_select( t_hidden_states, mask) # (bs * seq_length * dim) t_hidden_states_slct = t_hidden_states_slct.view( -1, dim) # (bs * seq_length, dim) target = s_hidden_states_slct.new( s_hidden_states_slct.size(0)).fill_(1) # (bs * seq_length,) loss_cos = self.cosine_loss_fct(s_hidden_states_slct, t_hidden_states_slct, target) loss += self.alpha_cos * loss_cos self.total_loss_epoch += loss.item() self.last_loss = loss.item() self.last_loss_ce = loss_ce.item() if self.alpha_mlm > 0.: self.last_loss_mlm = loss_mlm.item() if self.alpha_clm > 0.: self.last_loss_clm = loss_clm.item() if self.alpha_mse > 0.: self.last_loss_mse = loss_mse.item() if self.alpha_cos > 0.: self.last_loss_cos = loss_cos.item() self.optimize(loss) self.n_sequences_epoch += input_ids.size(0) def optimize(self, loss): """ 
Normalization on the loss (gradient accumulation or distributed training), followed by backward pass on the loss, possibly followed by a parameter update (depending on the gradient accumulation). Also update the metrics for tensorboard. """ # Check for NaN if (loss != loss).data.any(): logger.error('NaN detected') exit() if self.multi_gpu: loss = loss.mean() if self.params.gradient_accumulation_steps > 1: loss = loss / self.params.gradient_accumulation_steps if self.fp16: from apex import amp with amp.scale_loss(loss, self.optimizer) as scaled_loss: scaled_loss.backward() else: loss.backward() self.iter() if self.n_iter % self.params.gradient_accumulation_steps == 0: if self.fp16: torch.nn.utils.clip_grad_norm_( amp.master_params(self.optimizer), self.params.max_grad_norm) else: torch.nn.utils.clip_grad_norm_(self.student.parameters(), self.params.max_grad_norm) self.optimizer.step() self.optimizer.zero_grad() self.scheduler.step() def iter(self): """ Update global counts, write to tensorboard and save checkpoint. """ self.n_iter += 1 self.n_total_iter += 1 if self.n_total_iter % self.params.log_interval == 0: self.log_tensorboard() self.last_log = time.time() if self.n_total_iter % self.params.checkpoint_interval == 0: self.save_checkpoint() def log_tensorboard(self): """ Log into tensorboard. Only by the master process. """ if not self.is_master: return for param_name, param in self.student.named_parameters(): self.tensorboard.add_scalar(tag='parameter_mean/' + param_name, scalar_value=param.data.mean(), global_step=self.n_total_iter) self.tensorboard.add_scalar(tag='parameter_std/' + param_name, scalar_value=param.data.std(), global_step=self.n_total_iter) if param.grad is None: continue self.tensorboard.add_scalar(tag="grad_mean/" + param_name, scalar_value=param.grad.data.mean(), global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="grad_std/" + param_name, scalar_value=param.grad.data.std(), global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="losses/cum_avg_loss_epoch", scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="losses/loss", scalar_value=self.last_loss, global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter) if self.alpha_mlm > 0.: self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter) if self.alpha_clm > 0.: self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter) if self.alpha_mse > 0.: self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter) if self.alpha_cos > 0.: self.tensorboard.add_scalar(tag="losses/loss_cos", scalar_value=self.last_loss_cos, global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="learning_rate/lr", scalar_value=self.scheduler.get_lr()[0], global_step=self.n_total_iter) self.tensorboard.add_scalar( tag="global/memory_usage", scalar_value=psutil.virtual_memory()._asdict()['used'] / 1_000_000, global_step=self.n_total_iter) self.tensorboard.add_scalar(tag="global/speed", scalar_value=time.time() - self.last_log, global_step=self.n_total_iter) def end_epoch(self): """ Finally arrived at the end of epoch (full pass on dataset). Do some tensorboard logging and checkpoint saving. """ logger.info( f'{self.n_sequences_epoch} sequences have been trained during this epoch.' 
) if self.is_master: self.save_checkpoint( checkpoint_name=f'model_epoch_{self.epoch}.pth') self.tensorboard.add_scalar(tag='epoch/loss', scalar_value=self.total_loss_epoch / self.n_iter, global_step=self.epoch) self.epoch += 1 self.n_sequences_epoch = 0 self.n_iter = 0 self.total_loss_epoch = 0 def save_checkpoint(self, checkpoint_name: str = 'checkpoint.pth'): """ Save the current state. Only by the master process. """ if not self.is_master: return mdl_to_save = self.student.module if hasattr( self.student, 'module') else self.student mdl_to_save.config.save_pretrained(self.dump_path) state_dict = mdl_to_save.state_dict() torch.save(state_dict, os.path.join(self.dump_path, checkpoint_name))
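# ---------------------------------------------------------------------------
# For reference: the core of the distillation objective computed in `step()`
# above (KLDivLoss over temperature-softened distributions, rescaled by T^2)
# can be isolated as below. This is a minimal, self-contained sketch; the
# function name `soft_distillation_loss` and the dummy shapes are illustrative
# only and are not part of the Distiller class.
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn
import torch.nn.functional as F


def soft_distillation_loss(s_logits: torch.Tensor,
                           t_logits: torch.Tensor,
                           temperature: float = 2.0) -> torch.Tensor:
    """Temperature-scaled KL divergence between student and teacher logits.

    Both inputs are assumed to be already restricted to the positions that
    should be distilled, i.e. of shape (n_selected_tokens, voc_size).
    """
    kl = nn.KLDivLoss(reduction='batchmean')
    loss_ce = kl(F.log_softmax(s_logits / temperature, dim=-1),
                 F.softmax(t_logits / temperature, dim=-1)) * temperature ** 2
    return loss_ce


if __name__ == '__main__':
    student_logits = torch.randn(16, 30522)  # dummy student outputs
    teacher_logits = torch.randn(16, 30522)  # dummy teacher outputs
    print(soft_distillation_loss(student_logits, teacher_logits).item())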
def main(): parser = argparse.ArgumentParser( description='Descriptor Generator for PyTorch ImageNet Example') parser.add_argument( 'data', metavar='DIR', nargs='*', help='path(s) to dataset (if one path is provided, it is assumed\n' + 'to have subdirectories named "train" and "val"; alternatively,\n' + 'train and val paths can be specified directly by providing both paths as arguments)' ) parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', choices=model_names, help='model architecture: ' + ' | '.join(model_names) + ' (default: resnet18)') parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers (default: 4)') parser.add_argument( '-bs', '--batch-size', default=256, type=int, metavar='N', help='batch size for descriptor generation (default: 256)') parser.add_argument('-p', '--print-freq', default=50, type=int, metavar='N', help='print frequency (default: 50)') #parser.add_argument('--evaluate', dest='evaluate', action='store_true', # help='evaluate model on validation set') parser.add_argument('--fp16', action='store_true', help='Run model fp16 mode.') parser.add_argument('--dali_cpu', action='store_true', help='Runs CPU based version of DALI pipeline.') parser.add_argument( '--static-loss-scale', type=float, default=1, help= 'Static loss scale, positive power of 2 values can improve fp16 convergence.' ) parser.add_argument( '--dynamic-loss-scale', action='store_true', help='Use dynamic loss scaling. If supplied, this argument supersedes ' + '--static-loss-scale.') parser.add_argument('--prof', dest='prof', action='store_true', help='Only run 10 iterations for profiling.') parser.add_argument('-t', '--test', action='store_true', help='Launch test mode with preset arguments') parser.add_argument("--local_rank", default=0, type=int) # added parser.add_argument('-ts', '--train-size', type=int, default=0, metavar='N', help='number of examples for training (default: 0)') parser.add_argument( '-ir', '--imbalance-ratio', type=int, default=1, metavar='N', help= 'ratio of 0..499 to 500..999 labels in the training dataset drawn from uniform distribution' ) parser.add_argument( '-nr', '--noisy-ratio', type=float, default=0.0, metavar='N', help= 'ratio of noisy(random) labels in the training dataset drawn from uniform distribution' ) parser.add_argument( '-ens', '--ensemble-size', type=int, default=1, metavar='E', help='defines size of ensemble or, by default, no ensemble if = 1') parser.add_argument('-e', '--ensemble-index', type=int, default=0, metavar='E', help='defines index of ensemble') parser.add_argument('--save-folder', default='../local_data/ImageNet', type=str, help='dir to save data') parser.add_argument('-r', '--run-folder', default='run99', type=str, help='dir to save run') parser.add_argument('-b', '--batch', type=int, default=0, metavar='N', help='augmentation batch (iteration) (default: 0)') parser.add_argument( '-sub', '--subtype-method', type=str, default='grad', metavar='N', help='method to generate gradient information (default: grad)') parser.add_argument('-aug', '--augment-method', type=str, default='random', metavar='N', help='method to match distributions (default: random)') parser.add_argument('-smp', '--sample-steps', type=int, default=1, metavar='N', help='number of samples for estimation (default: 1)') parser.add_argument('-dl', '--descriptor-length', type=int, default=0, metavar='L', help='descriptor length (default: 0)') parser.add_argument( '-unsup', '--unsupervised', type=int, default=0, 
help='unsupervised pretraining as initial step or random weights') args = parser.parse_args() cudnn.benchmark = True # test mode, use default args for sanity test if args.test: args.fp16 = False args.arch = 'resnet18' args.batch_size = 256 args.data = [] args.prof = True args.data.append('/data/imagenet/train-jpeg/') args.data.append('/data/imagenet/val-jpeg/') if not len(args.data): raise Exception("error: too few data arguments") args.distributed = False if 'WORLD_SIZE' in os.environ: args.distributed = int(os.environ['WORLD_SIZE']) > 1 args.gpu = 0 args.world_size = 1 if args.distributed: args.gpu = args.local_rank % torch.cuda.device_count() torch.cuda.set_device(args.gpu) torch.distributed.init_process_group(backend='nccl', init_method='env://') args.world_size = torch.distributed.get_world_size() if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." if args.static_loss_scale != 1.0: if not args.fp16: print( "Warning: if --fp16 is not used, static_loss_scale will be ignored." ) # Data loading code if len(args.data) == 1: train_dir = os.path.join(args.data[0], 'train') val_dir = os.path.join(args.data[0], 'val') else: train_dir = args.data[0] val_dir = args.data[1] if (args.arch == "inception_v3"): crop_size = 299 val_size = 320 # I chose this value arbitrarily, we can adjust. else: crop_size = 224 val_size = 256 # lists for full datasets val_list_file = '{}/{}'.format(args.save_folder, 'processed/val_list.txt') if (args.imbalance_ratio == 1) and (args.noisy_ratio == 0.0): # use original training dataset train_list_file = '{}/{}'.format(args.save_folder, 'processed/train_list.txt') else: train_list_file = '{}/{}/full_train_list_ir_{}_nr_{}.txt'.format( args.save_folder, args.run_folder, args.imbalance_ratio, args.noisy_ratio) pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=train_dir, file_list=train_list_file, crop=crop_size, local_rank=args.local_rank, world_size=args.world_size, size=val_size) pipe.build() train_loader = DALIClassificationIterator( pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) pipe = HybridValPipe(batch_size=args.batch_size, num_threads=args.workers, device_id=args.local_rank, data_dir=val_dir, file_list=val_list_file, crop=crop_size, local_rank=args.local_rank, world_size=args.world_size, size=val_size) pipe.build() val_loader = DALIClassificationIterator( pipe, size=int(pipe.epoch_size("Reader") / args.world_size)) # create model print("=> creating model '{} {}'".format(args.arch, args.descriptor_length)) if 'MC' in args.subtype_method: model = models.__dict__[args.arch](L=args.descriptor_length, MC=True) else: model = models.__dict__[args.arch](L=args.descriptor_length, MC=False) # model = model.cuda() if args.fp16: model = network_to_half(model) if args.distributed: # shared param/delay all reduce turns off bucketing in DDP, for lower latency runs this can improve perf # for the older version of APEX please use shared_param, for newer one it is delay_allreduce model = DDP(model, delay_allreduce=True) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), lr=1e-0) if args.fp16: optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.static_loss_scale, dynamic_loss_scale=args.dynamic_loss_scale) # load checkpoint model_folder = '{}/{}/checkpoint'.format(args.save_folder, args.run_folder) assert os.path.isdir( model_folder), 'Error: no model checkpoint 
directory found!' # if args.unsupervised == 1: unsup_prefix = 'unsup_' else: unsup_prefix = '' # descr_postfix = '{}batch_B_ir_{}_nr_{}_sub_{}_aug_{}_L_{}'.format( unsup_prefix, # we do not save descriptors for each iteration here due to large size args.imbalance_ratio, args.noisy_ratio, args.subtype_method, args.augment_method, args.descriptor_length) if args.batch == 0: model_postfix = '{}batch_{}_ir_{}_nr_{}_sub_{}_aug_{}'.format( unsup_prefix, args.batch, args.imbalance_ratio, args.noisy_ratio, 'none', 'none') if args.ensemble_size > 1: checkpoint_file = '{}/init_{}_E_{}.pt'.format( model_folder, model_postfix, args.ensemble_index) else: checkpoint_file = '{}/init_{}.pt'.format(model_folder, model_postfix) else: model_postfix = '{}batch_{}_size_{}_ir_{}_nr_{}_sub_{}_aug_{}_L_{}'.format( unsup_prefix, args.batch, args.train_size, args.imbalance_ratio, args.noisy_ratio, args.subtype_method, args.augment_method, args.descriptor_length) if args.ensemble_size > 1: checkpoint_file = '{}/best_{}_E_{}.pt'.format( model_folder, model_postfix, args.ensemble_index) else: checkpoint_file = '{}/best_{}.pt'.format(model_folder, model_postfix) # print('Generating descriptors using model checkpoint:', checkpoint_file) if os.path.isfile(checkpoint_file): checkpoint = torch.load(checkpoint_file) #print('TRANSFER', checkpoint['state_dict'].items()) if args.batch == 0 and args.unsupervised == 1: model.load_state_dict( { k: v for k, v in checkpoint['state_dict'].items() if 'fc' not in k }, strict=False) # copy all but last linear layer! else: model.load_state_dict(checkpoint['state_dict']) else: print('Some files are missing in gen_descr.py!') sys.exit(0) # val_prefix = 'val' train_prefix = 'train' if args.ensemble_size > 1: descr_val_file = '{}/{}/descr/{}_{}_E_{}.pt'.format( args.save_folder, args.run_folder, val_prefix, descr_postfix, args.ensemble_index) descr_train_file = '{}/{}/descr/{}_{}_E_{}.pt'.format( args.save_folder, args.run_folder, train_prefix, descr_postfix, args.ensemble_index) else: descr_val_file = '{}/{}/descr/{}_{}.pt'.format(args.save_folder, args.run_folder, val_prefix, descr_postfix) descr_train_file = '{}/{}/descr/{}_{}.pt'.format( args.save_folder, args.run_folder, train_prefix, descr_postfix) # if 'MC' in args.subtype_method: print('Generating train MC') with torch.no_grad(): gen_mc(args, train_loader, model, criterion, optimizer, train_prefix, descr_train_file) else: print('Generating val descriptors') gen_descr(args, val_loader, model, criterion, optimizer, val_prefix, descr_val_file, descr_val_file) print('Generating train descriptors') gen_descr(args, train_loader, model, criterion, optimizer, train_prefix, descr_train_file, descr_val_file)
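# ---------------------------------------------------------------------------
# `gen_descr` / `gen_mc` are defined elsewhere in this repository. As a rough,
# self-contained illustration of the general pattern (run the frozen model
# over a batch and keep one fixed-length feature vector per image), the sketch
# below hooks the penultimate layer of a stock torchvision resnet18. The name
# `extract_descriptors` and the dummy input are illustrative assumptions, not
# part of this script.
# ---------------------------------------------------------------------------
import torch
import torchvision.models as tv_models


def extract_descriptors(model: torch.nn.Module, images: torch.Tensor) -> torch.Tensor:
    """Collect the output of the global-average-pool layer for a batch."""
    feats = []

    def hook(_module, _inputs, output):
        feats.append(torch.flatten(output, 1).detach().cpu())

    handle = model.avgpool.register_forward_hook(hook)
    model.eval()
    with torch.no_grad():
        model(images)
    handle.remove()
    return torch.cat(feats, dim=0)  # (batch, 512) for resnet18


if __name__ == '__main__':
    net = tv_models.resnet18(pretrained=False)
    dummy = torch.randn(4, 3, 224, 224)
    print(extract_descriptors(net, dummy).shape)  # torch.Size([4, 512])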
def main(): global best_prec1, args args.distributed = args.world_size > 1 args.gpu = 0 if args.distributed: args.gpu = args.rank % torch.cuda.device_count() if args.distributed: torch.cuda.set_device(args.gpu) dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, world_size=args.world_size, rank=args.rank) if args.fp16: assert torch.backends.cudnn.enabled, "fp16 mode requires cudnn backend to be enabled." # create model if args.pretrained: print("=> using pre-trained model '{}'".format(args.arch)) model = models.__dict__[args.arch](pretrained=True, num_classes=args.num_classes) else: print("=> creating model '{}'".format(args.arch)) model = models.__dict__[args.arch](num_classes=args.num_classes) model = model.cuda() if args.fp16: model = network_to_half(model) if args.distributed: model = DDP(model) global model_params, master_params if args.fp16: model_params, master_params = prep_param_lists(model) else: master_params = list(model.parameters()) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(master_params, args.lr, momentum=args.momentum, weight_decay=args.weight_decay) # optionally resume from a checkpoint if args.resume: if os.path.isfile(args.resume): print("=> loading checkpoint '{}'".format(args.resume)) checkpoint = torch.load(args.resume, map_location = lambda storage, loc: storage.cuda(args.gpu)) args.start_epoch = checkpoint['epoch'] best_prec1 = checkpoint['best_prec1'] model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) print("=> loaded checkpoint '{}' (epoch {})" .format(args.resume, checkpoint['epoch'])) else: print("=> no checkpoint found at '{}'".format(args.resume)) # Data loading code traindir = os.path.join(args.data, 'train') valdir = os.path.join(args.data, 'val') pipe = HybridPipe(batch_size=args.batch_size, num_threads=args.workers, device_id = args.rank, data_dir = traindir) pipe.build() test_run = pipe.run() from nvidia.dali.plugin.pytorch import DALIClassificationIterator train_loader = DALIClassificationIterator(pipe, size = int(1281167 / args.world_size) ) pipe = HybridPipe(batch_size=args.batch_size, num_threads=args.workers, device_id = args.rank, data_dir = valdir) pipe.build() test_run = pipe.run() from nvidia.dali.plugin.pytorch import DALIClassificationIterator val_loader = DALIClassificationIterator(pipe, size = int(50000 / args.world_size) ) if args.evaluate: validate(val_loader, model, criterion) return for epoch in range(args.start_epoch, args.epochs): adjust_learning_rate(optimizer, epoch) # train for one epoch train(train_loader, model, criterion, optimizer, epoch) if args.prof: break # evaluate on validation set prec1 = validate(val_loader, model, criterion) # remember best prec@1 and save checkpoint if args.rank == 0: is_best = prec1 > best_prec1 best_prec1 = max(prec1, best_prec1) save_checkpoint({ 'epoch': epoch + 1, 'arch': args.arch, 'state_dict': model.state_dict(), 'best_prec1': best_prec1, 'optimizer' : optimizer.state_dict(), }, is_best)
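# ---------------------------------------------------------------------------
# Note on fp16: the training script above uses the legacy apex helpers
# (network_to_half, prep_param_lists, FP16_Optimizer). For comparison only --
# this is not the code path used here -- a roughly equivalent mixed-precision
# step written against the built-in torch.cuda.amp API could look like the
# sketch below (the function name and its arguments are illustrative).
# ---------------------------------------------------------------------------
import torch
import torch.nn as nn


def amp_train_step(model: nn.Module,
                   criterion: nn.Module,
                   optimizer: torch.optim.Optimizer,
                   scaler: torch.cuda.amp.GradScaler,
                   images: torch.Tensor,
                   target: torch.Tensor) -> float:
    """One mixed-precision training step using autocast + gradient scaling."""
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():   # forward pass runs in mixed precision
        output = model(images)
        loss = criterion(output, target)
    scaler.scale(loss).backward()     # scale the loss to avoid fp16 underflow
    scaler.step(optimizer)            # unscale gradients, then update weights
    scaler.update()                   # adapt the loss scale for the next step
    return loss.item()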