def main():
    """Train an FCOS detector.

    Parses command-line arguments, builds the train/validation datasets and
    data loaders, constructs the network described by ``--net-config``, then
    runs the epoch loop with periodic validation and checkpointing.
    """
    parser = argparse.ArgumentParser(
        description='FCOS Detector Training With Pytorch')
    parser.add_argument(
        '--dataset-style', type=str, required=True,
        help="style of dataset (supported are 'pascal-voc' and 'coco')")
    parser.add_argument('--dataset', required=True, help='dataset path')
    parser.add_argument(
        '--train-image-set', type=str, default="train",
        help='image set (annotation file basename for COCO) '
        'to use for training')
    parser.add_argument(
        '--val-image-set', type=str, default="val",
        help='image set (annotation file basename for COCO) '
        'to use for validation')
    parser.add_argument(
        '--val-dataset', default=None,
        help='separate validation dataset directory path')
    parser.add_argument(
        '--net-config',
        help="path to network architecture configuration file "
        "(take a look into 'preset' directory for the reference)")

    # Params for optimizer
    parser.add_argument(
        '--optimizer', default="ranger",
        help="optimizer to use ('sgd', 'diffgrad', 'adamw', or 'ranger')")
    parser.add_argument(
        '--lr', '--learning-rate', default=1e-3, type=float,
        help='initial learning rate')
    parser.add_argument(
        '--momentum', default=0.9, type=float,
        help='optional momentum for SGD optimizer (default is 0.9)')
    parser.add_argument(
        '--weight-decay', default=5e-4, type=float,
        help='optional weight decay (L2 penalty) '
        'for SGD optimizer (default is 5e-4)')

    parser.add_argument('--backbone-pretrained', action='store_true')
    parser.add_argument(
        '--backbone-weights',
        help='pretrained weights for the backbone model')
    parser.add_argument('--freeze-backbone', action='store_true')

    # Scheduler
    parser.add_argument(
        '--scheduler', default="cosine-wr", type=str,
        help="scheduler for SGD. It can one of 'multi-step' and 'cosine-wr'")

    # Params for Scheduler
    parser.add_argument(
        '--milestones', default="70,100", type=str,
        help="milestones for MultiStepLR")
    parser.add_argument(
        '--t0', default=10, type=int,
        help='T_0 value for Cosine Annealing Warm Restarts.')
    # BUGFIX: T_mult was parsed as float, but PyTorch's
    # CosineAnnealingWarmRestarts raises ValueError unless T_mult is an
    # integer >= 1 — the float default made the scheduler unconstructible.
    parser.add_argument(
        '--t-mult', default=2, type=int,
        help='T_mult value for Cosine Annealing Warm Restarts.')

    # Train params
    parser.add_argument('--batch-size', default=32, type=int,
                        help='batch size')
    parser.add_argument(
        '--num-epochs', default=120, type=int,
        help='number of epochs to train')
    parser.add_argument(
        '--num-workers', default=4, type=int,
        help='number of workers used in dataloading')
    parser.add_argument(
        '--val-epochs', default=5, type=int,
        help='perform validation every this many epochs')
    parser.add_argument(
        '--device', type=str,
        help='device to use for training')
    parser.add_argument(
        '--checkpoint-path', default='output',
        help='directory for saving checkpoint models')

    logging.basicConfig(
        stream=sys.stdout, level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s')

    args = parser.parse_args()
    logging.info(args)

    # Pick an explicit device if requested, otherwise prefer CUDA.
    if args.device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    else:
        device = args.device

    if device.startswith("cuda"):
        logging.info("Use CUDA")

    timer = Timer()

    arch = get_arch(args.net_config)

    bbox_format = dataset_bbox_format(args.dataset_style)

    # Per-split normalization statistics and preprocessing pipelines.
    train_mean, train_std = mean_std(
        args.dataset_style, args.dataset, args.train_image_set)
    train_transform = processing.train.Pipeline(
        [arch.image_size] * 2, train_mean, train_std,
        bbox_format=bbox_format)

    if args.val_dataset is not None:
        val_dataset_root = args.val_dataset
    else:
        val_dataset_root = args.dataset
    val_mean, val_std = mean_std(
        args.dataset_style, val_dataset_root, args.val_image_set)
    val_transform = processing.test.Pipeline(
        [arch.image_size] * 2, val_mean, val_std,
        bbox_format=bbox_format)

    logging.info("Loading datasets...")

    dataset = load_dataset(
        args.dataset_style, args.dataset,
        args.train_image_set, train_transform)
    num_classes = len(dataset.class_names)
    logging.info("Train dataset size: {}".format(len(dataset)))

    # don't allow the last batch be of length 1
    # to not lead our dear BatchNorms to crash on that
    drop_last = len(dataset) % args.batch_size > 0
    train_loader = DataLoader(
        dataset, args.batch_size, collate_fn=collate,
        num_workers=args.num_workers, shuffle=True, drop_last=drop_last)

    val_dataset = load_dataset(
        args.dataset_style, val_dataset_root,
        args.val_image_set, val_transform)
    logging.info("Validation dataset size: {}".format(len(val_dataset)))

    # BUGFIX: the validation loader's drop_last used to be derived from the
    # *train* dataset length; compute it from the validation dataset.
    val_drop_last = len(val_dataset) % args.batch_size > 0
    val_loader = DataLoader(
        val_dataset, args.batch_size, collate_fn=collate,
        num_workers=args.num_workers, shuffle=False,
        drop_last=val_drop_last)

    logging.info("Building network")

    # BUGFIX: --backbone-pretrained is a store_true flag, so the attribute
    # is always a bool and never None; the previous `is not None` test was
    # unconditionally True, silently forcing pretrained backbones.
    backbone_pretrained = args.backbone_pretrained
    net = arch.build(num_classes, backbone_pretrained, args.batch_size)

    if backbone_pretrained and args.backbone_weights is not None:
        logging.info(f"Load backbone weights from {args.backbone_weights}")
        timer.start("Loading backbone model")
        net.load_backbone_weights(args.backbone_weights)
        logging.info(f'Took {timer.end("Loading backbone model"):.2f}s.')

    if args.freeze_backbone:
        net.freeze_backbone()

    net.to(device)

    last_epoch = -1

    criterion = arch.loss(net, device)
    mapper = arch.mapper(net, device)

    optim_kwargs = {
        "lr": args.lr,
        "weight_decay": args.weight_decay
    }
    if args.optimizer == "sgd":
        optim_class = torch.optim.SGD
        optim_kwargs.update({
            "momentum": args.momentum
        })
    elif args.optimizer == "adamw":
        optim_class = torch.optim.AdamW
    elif args.optimizer == "diffgrad":
        optim_class = DiffGrad
    else:
        optim_class = Ranger

    optimizer = optim_class(net.parameters(), **optim_kwargs)
    logging.info(f"Optimizer parameters used: {optim_kwargs}")

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(
            optimizer, milestones=milestones,
            gamma=0.1, last_epoch=last_epoch)
    else:
        logging.info("Uses Cosine annealing warm restarts scheduler.")
        scheduler = CosineAnnealingWarmRestarts(
            optimizer, T_0=args.t0, T_mult=args.t_mult, eta_min=1e-5)

    os.makedirs(args.checkpoint_path, exist_ok=True)

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, args.num_epochs):
        loop(train_loader, net, mapper, criterion, optimizer,
             device=device, epoch=epoch)
        scheduler.step()

        # Validate (and checkpoint) every --val-epochs epochs and always on
        # the final epoch.
        if (epoch > 0 and epoch % args.val_epochs == 0
                or epoch == args.num_epochs - 1):
            val_loss = loop(
                val_loader, net, mapper, criterion,
                device=device, epoch=epoch)

            filename = f"{arch.name}-Epoch-{epoch}-Loss-{val_loss}.pth"
            model_path = os.path.join(args.checkpoint_path, filename)
            save(arch, net, dataset.class_names, model_path)
            logging.info(f"Saved model {model_path}")
generator = Generator() discriminator = Discriminator() generator_writer.add_graph(generator, [torch.rand([1, 1, 16384], dtype=torch.float32)]) discriminator_writer.add_graph(discriminator, [ torch.rand([1, 2, 16384], dtype=torch.float32), torch.rand([1, 2, 16384], dtype=torch.float32), ]) g_optimizer = Adam(generator.parameters(), cfg=cfg['hparas']['optim']) d_optimizer = Adam(discriminator.parameters(), cfg=cfg['hparas']['optim']) g_lr_change = CosineAnnealingWarmRestarts(optimizer=g_optimizer, T_0=10, T_mult=2, eta_min=0, last_epoch=-1) d_lr_change = CosineAnnealingWarmRestarts(optimizer=d_optimizer, T_0=10, T_mult=2, eta_min=0, last_epoch=-1) if not os.path.exists(SAVE_PATH): os.makedirs(SAVE_PATH) if cfg['hparas']['train_continue']: print('loading models ...') generator.load_state_dict( torch.load(
#------------------------------------------------------# # 主干特征提取网络特征通用,冻结训练可以加快训练速度 # 也可以在训练初期防止权值被破坏。 # Init_Epoch为起始epoch # Freeze_Epoch为冻结训练的epoch # Epoch总训练epoch # 提示OOM或者显存不足请调小Batch_size #------------------------------------------------------# if True: initial_lr = 1e-3 Init_Epoch = 15 Freeze_Epoch = 20 optimizer = optim.SGD(net.parameters(), lr=initial_lr) if Cosine_lr: lr_scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5) else: lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.92) # 取出放在run中的超参数 runs = runBuilder.get_runs(parameters) print(runs) train_set = get_train_set(classes_path, 'train') train_loader = DataLoader(train_set, batch_size=runs[0].batch_size, num_workers=runs[0].num_workers, shuffle=runs[0].shuffle) print("data successfully loaded!") # 冻结一定部分训练 print("start to freeze the backbone!")
def main():
    """Multi-task ViLBERT fine-tuning entry point.

    Parses CLI arguments, loads per-task settings from ``vilbert_tasks.yml``,
    builds the (possibly distributed) model, optimizer and LR schedulers,
    optionally resumes from a checkpoint, then runs the multi-task training
    loop with periodic per-task evaluation and per-epoch checkpointing.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--from_pretrained",
        default="bert-base-uncased",
        type=str,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.",
    )
    parser.add_argument(
        "--output_dir",
        default="save",
        type=str,
        help=
        "The output directory where the model checkpoints will be written.",
    )
    parser.add_argument(
        "--config_file",
        default="config/bert_base_6layer_6conect.json",
        type=str,
        help="The config file which specified the model details.",
    )
    parser.add_argument(
        "--num_train_epochs",
        default=20,
        type=int,
        help="Total number of training epochs to perform.",
    )
    parser.add_argument(
        "--train_iter_multiplier",
        default=1.0,
        type=float,
        help="multiplier for the multi-task training.",
    )
    parser.add_argument(
        "--train_iter_gap",
        default=4,
        type=int,
        help=
        "forward every n iteration is the validation score is not improving over the last 3 epoch, -1 means will stop",
    )
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    # NOTE(review): type=bool on argparse converts any non-empty string to
    # True ("--do_lower_case False" is still truthy); only the default is
    # reliable here.
    parser.add_argument(
        "--do_lower_case",
        default=True,
        type=bool,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models.",
    )
    parser.add_argument(
        "--local_rank",
        type=int,
        default=-1,
        help="local_rank for distributed training on gpus",
    )
    parser.add_argument("--seed",
                        type=int,
                        default=0,
                        help="random seed for initialization")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass.",
    )
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit float precision instead of 32-bit",
    )
    parser.add_argument(
        "--loss_scale",
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=4,
        help="Number of workers in the dataloader.",
    )
    parser.add_argument("--save_name",
                        default="",
                        type=str,
                        help="save name for training.")
    parser.add_argument(
        "--in_memory",
        default=False,
        type=bool,
        help="whether use chunck for parallel training.",
    )
    parser.add_argument("--optim",
                        default="AdamW",
                        type=str,
                        help="what to use for the optimization.")
    parser.add_argument("--tasks",
                        default="",
                        type=str,
                        help="1-2-3... training task separate by -")
    parser.add_argument(
        "--freeze",
        default=-1,
        type=int,
        help="till which layer of textual stream of vilbert need to fixed.",
    )
    parser.add_argument(
        "--vision_scratch",
        action="store_true",
        help="whether pre-trained the image or not.",
    )
    parser.add_argument("--evaluation_interval",
                        default=1,
                        type=int,
                        help="evaluate very n epoch.")
    parser.add_argument(
        "--lr_scheduler",
        default="mannul",
        type=str,
        help="whether use learning rate scheduler.",
    )
    parser.add_argument("--baseline",
                        action="store_true",
                        help="whether use single stream baseline.")
    parser.add_argument("--resume_file",
                        default="",
                        type=str,
                        help="Resume from checkpoint")
    parser.add_argument(
        "--dynamic_attention",
        action="store_true",
        help="whether use dynamic attention.",
    )
    parser.add_argument(
        "--clean_train_sets",
        default=True,
        type=bool,
        help="whether clean train sets for multitask data.",
    )
    parser.add_argument(
        "--visual_target",
        default=0,
        type=int,
        help="which target to use for visual branch. \
        0: soft label, \
        1: regress the feature, \
        2: NCE loss.",
    )
    parser.add_argument(
        "--task_specific_tokens",
        action="store_true",
        help="whether to use task specific tokens for the multi-task learning.",
    )

    args = parser.parse_args()

    # Per-task settings (names, learning rates, epoch counts, ...).
    with open("vilbert_tasks.yml", "r") as f:
        task_cfg = edict(yaml.safe_load(f))

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Single-stream baseline and two-stream ViLBERT live in different modules.
    if args.baseline:
        from pytorch_transformers.modeling_bert import BertConfig
        from vilbert.basebert import BaseBertForVLTasks
    else:
        from vilbert.vilbert import BertConfig
        from vilbert.vilbert import VILBertForVLTasks

    # Collect the selected tasks ("--tasks 1-2" -> TASK1, TASK2).
    task_names = []
    task_lr = []
    for i, task_id in enumerate(args.tasks.split("-")):
        task = "TASK" + task_id
        name = task_cfg[task]["name"]
        task_names.append(name)
        task_lr.append(task_cfg[task]["lr"])

    # Per-task loss scale relative to the smallest task learning rate.
    base_lr = min(task_lr)
    loss_scale = {}
    for i, task_id in enumerate(args.tasks.split("-")):
        task = "TASK" + task_id
        loss_scale[task] = task_lr[i] / base_lr

    if args.save_name:
        prefix = "-" + args.save_name
    else:
        prefix = ""
    timeStamp = ("-".join(task_names) + "_" +
                 args.config_file.split("/")[1].split(".")[0] + prefix)
    savePath = os.path.join(args.output_dir, timeStamp)

    bert_weight_name = json.load(
        open("config/" + args.bert_model + "_weight_name.json", "r"))

    # Device / distributed setup: local_rank == -1 means single-process.
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        torch.distributed.init_process_group(backend="nccl")

    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    # Only rank 0 (or the single process) does logging/saving.
    default_gpu = False
    if dist.is_available() and args.local_rank != -1:
        rank = dist.get_rank()
        if rank == 0:
            default_gpu = True
    else:
        default_gpu = True

    if default_gpu:
        if not os.path.exists(savePath):
            os.makedirs(savePath)

    config = BertConfig.from_json_file(args.config_file)
    if default_gpu:
        # save all the hidden parameters.
        with open(os.path.join(savePath, "command.txt"), "w") as f:
            print(args, file=f)  # Python 3.x
            print("\n", file=f)
            print(config, file=f)

    task_batch_size, task_num_iters, task_ids, task_datasets_train, task_datasets_val, task_dataloader_train, task_dataloader_val = LoadDatasets(
        args, task_cfg, args.tasks.split("-"))

    logdir = os.path.join(savePath, "logs")
    tbLogger = utils.tbLogger(
        logdir,
        savePath,
        task_names,
        task_ids,
        task_num_iters,
        args.gradient_accumulation_steps,
    )

    # Visual target head sizes: 1601-way soft label vs 2048-d feature space.
    if args.visual_target == 0:
        config.v_target_size = 1601
        config.visual_target = args.visual_target
    else:
        config.v_target_size = 2048
        config.visual_target = args.visual_target

    if args.task_specific_tokens:
        config.task_specific_tokens = True

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_ave_iter = {}
    task_stop_controller = {}
    for task_id, num_iter in task_num_iters.items():
        # NOTE(review): `task` here is stale — it still holds the last
        # "TASK<n>" from the loops above, so every entry uses that one
        # task's "num_epoch".  Presumably this should be derived from
        # `task_id`; confirm before changing.
        task_ave_iter[task_id] = int(task_cfg[task]["num_epoch"] * num_iter *
                                     args.train_iter_multiplier /
                                     args.num_train_epochs)
        task_stop_controller[task_id] = utils.MultiTaskStopOnPlateau(
            mode="max",
            patience=1,
            continue_threshold=0.005,
            cooldown=1,
            threshold=0.001,
        )

    # NOTE(review): despite the name, this is the *largest* per-task
    # iteration count (last element of the sorted list), not the median.
    task_ave_iter_list = sorted(task_ave_iter.values())
    median_num_iter = task_ave_iter_list[-1]
    num_train_optimization_steps = (median_num_iter * args.num_train_epochs //
                                    args.gradient_accumulation_steps)
    num_labels = max(
        [dataset.num_labels for dataset in task_datasets_train.values()])

    if args.dynamic_attention:
        config.dynamic_attention = True
    if "roberta" in args.bert_model:
        config.model = "roberta"

    if args.baseline:
        model = BaseBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config=config,
            num_labels=num_labels,
            default_gpu=default_gpu,
        )
    else:
        model = VILBertForVLTasks.from_pretrained(
            args.from_pretrained,
            config=config,
            num_labels=num_labels,
            default_gpu=default_gpu,
        )

    task_losses = LoadLosses(args, task_cfg, args.tasks.split("-"))

    # Parameters matching these substrings get no weight decay.
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]

    # Optionally freeze the textual stream up to layer `--freeze`.
    if args.freeze != -1:
        bert_weight_name_filtered = []
        for name in bert_weight_name:
            if "embeddings" in name:
                bert_weight_name_filtered.append(name)
            elif "encoder" in name:
                layer_num = name.split(".")[2]
                if int(layer_num) <= args.freeze:
                    bert_weight_name_filtered.append(name)

        optimizer_grouped_parameters = []
        # key[12:] strips the model-prefix so names line up with the
        # entries in bert_weight_name_filtered.
        for key, value in dict(model.named_parameters()).items():
            if key[12:] in bert_weight_name_filtered:
                value.requires_grad = False

        if default_gpu:
            print("filtered weight")
            print(bert_weight_name_filtered)

    # Build per-parameter groups: task heads ("vil_") get a fixed 1e-4 LR,
    # everything else the smallest task LR (or 1e-4 for non-BERT vision
    # weights when training the vision stream from scratch).
    optimizer_grouped_parameters = []
    for key, value in dict(model.named_parameters()).items():
        if value.requires_grad:
            if "vil_" in key:
                lr = 1e-4
            else:
                if args.vision_scratch:
                    if key[12:] in bert_weight_name:
                        lr = base_lr
                    else:
                        lr = 1e-4
                else:
                    lr = base_lr
            if any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{
                    "params": [value],
                    "lr": lr,
                    "weight_decay": 0.0
                }]
            if not any(nd in key for nd in no_decay):
                optimizer_grouped_parameters += [{
                    "params": [value],
                    "lr": lr,
                    "weight_decay": 0.01
                }]

    if default_gpu:
        print(len(list(model.named_parameters())),
              len(optimizer_grouped_parameters))

    # NOTE(review): no else-branch — an unknown --optim leaves `optimizer`
    # undefined and crashes below.
    if args.optim == "AdamW":
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=base_lr,
                          correct_bias=False)
    elif args.optim == "RAdam":
        optimizer = RAdam(optimizer_grouped_parameters, lr=base_lr)

    # NOTE(review): "warmpu" is a typo for "warmup"; kept as-is since the
    # name is used consistently below.
    warmpu_steps = args.warmup_proportion * num_train_optimization_steps

    if args.lr_scheduler == "warmup_linear":
        warmup_scheduler = WarmupLinearSchedule(
            optimizer,
            warmup_steps=warmpu_steps,
            t_total=num_train_optimization_steps)
    else:
        warmup_scheduler = WarmupConstantSchedule(optimizer,
                                                  warmup_steps=warmpu_steps)

    # Epochs at which the "mannul" schedule decays the LR (and the stop
    # controllers are reset).
    lr_reduce_list = np.array([5, 7])
    if args.lr_scheduler == "automatic":
        lr_scheduler = ReduceLROnPlateau(optimizer,
                                         mode="max",
                                         factor=0.2,
                                         patience=1,
                                         cooldown=1,
                                         threshold=0.001)
    elif args.lr_scheduler == "cosine":
        lr_scheduler = CosineAnnealingLR(optimizer,
                                         T_max=median_num_iter *
                                         args.num_train_epochs)
    elif args.lr_scheduler == "cosine_warm":
        lr_scheduler = CosineAnnealingWarmRestarts(optimizer,
                                                   T_0=median_num_iter *
                                                   args.num_train_epochs)
    elif args.lr_scheduler == "mannul":

        def lr_lambda_fun(epoch):
            # Multiply the base LR by 0.2 for every milestone passed.
            return pow(0.2, np.sum(lr_reduce_list <= epoch))

        lr_scheduler = LambdaLR(optimizer, lr_lambda=lr_lambda_fun)

    startIterID = 0
    global_step = 0
    start_epoch = 0

    # Resume model/optimizer/scheduler/logger state from a checkpoint,
    # stripping any DataParallel "module." prefix from parameter names.
    if args.resume_file != "" and os.path.exists(args.resume_file):
        checkpoint = torch.load(args.resume_file, map_location="cpu")
        new_dict = {}
        for attr in checkpoint["model_state_dict"]:
            if attr.startswith("module."):
                new_dict[attr.replace(
                    "module.", "", 1)] = checkpoint["model_state_dict"][attr]
            else:
                new_dict[attr] = checkpoint["model_state_dict"][attr]
        model.load_state_dict(new_dict)
        warmup_scheduler.load_state_dict(
            checkpoint["warmup_scheduler_state_dict"])
        # lr_scheduler.load_state_dict(checkpoint['lr_scheduler_state_dict'])
        optimizer.load_state_dict(checkpoint["optimizer_state_dict"])
        global_step = checkpoint["global_step"]
        start_epoch = int(checkpoint["epoch_id"]) + 1
        task_stop_controller = checkpoint["task_stop_controller"]
        tbLogger = checkpoint["tb_logger"]
        del checkpoint

    model.to(device)

    # Move restored optimizer state tensors onto the GPU.
    for state in optimizer.state.values():
        for k, v in state.items():
            if torch.is_tensor(v):
                state[k] = v.cuda()

    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )
        model = DDP(model, delay_allreduce=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if default_gpu:
        print("***** Running training *****")
        print("  Num Iters: ", task_num_iters)
        print("  Batch size: ", task_batch_size)
        print("  Num steps: %d" % num_train_optimization_steps)

    task_iter_train = {name: None for name in task_ids}
    task_count = {name: 0 for name in task_ids}
    for epochId in tqdm(range(start_epoch, args.num_train_epochs),
                        desc="Epoch"):
        model.train()
        for step in range(median_num_iter):
            iterId = startIterID + step + (epochId * median_num_iter)
            first_task = True
            for task_id in task_ids:
                # Tasks on a stop-plateau are only forwarded every
                # --train_iter_gap iterations.
                is_forward = False
                if (not task_stop_controller[task_id].in_stop) or (
                        iterId % args.train_iter_gap == 0):
                    is_forward = True

                if is_forward:
                    loss, score = ForwardModelsTrain(
                        args,
                        task_cfg,
                        device,
                        task_id,
                        task_count,
                        task_iter_train,
                        task_dataloader_train,
                        model,
                        task_losses,
                    )

                    loss = loss * loss_scale[task_id]
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

                    loss.backward()
                    if (step + 1) % args.gradient_accumulation_steps == 0:
                        if args.fp16:
                            # NOTE(review): `args.learning_rate` and
                            # `warmup_linear` are not defined in this
                            # function's visible scope — this fp16 branch
                            # looks broken; confirm before relying on it.
                            lr_this_step = args.learning_rate * warmup_linear(
                                global_step / num_train_optimization_steps,
                                args.warmup_proportion,
                            )
                            for param_group in optimizer.param_groups:
                                param_group["lr"] = lr_this_step

                        # Step the warmup scheduler once per optimizer step
                        # (on the first task only).
                        if first_task and (global_step < warmpu_steps
                                           or args.lr_scheduler
                                           == "warmup_linear"):
                            warmup_scheduler.step()
                        optimizer.step()
                        model.zero_grad()
                        if first_task:
                            global_step += 1
                            first_task = False

                        if default_gpu:
                            tbLogger.step_train(
                                epochId,
                                iterId,
                                float(loss),
                                float(score),
                                optimizer.param_groups[0]["lr"],
                                task_id,
                                "train",
                            )

            # Cosine schedulers step per iteration once warmup is over.
            if "cosine" in args.lr_scheduler and global_step > warmpu_steps:
                lr_scheduler.step()

            if (step % (20 * args.gradient_accumulation_steps) == 0
                    and step != 0 and default_gpu):
                tbLogger.showLossTrain()

            # decided whether to evaluate on each tasks.
            for task_id in task_ids:
                if (iterId != 0 and iterId % task_num_iters[task_id]
                        == 0) or (epochId == args.num_train_epochs - 1
                                  and step == median_num_iter - 1):
                    evaluate(
                        args,
                        task_dataloader_val,
                        task_stop_controller,
                        task_cfg,
                        device,
                        task_id,
                        model,
                        task_losses,
                        epochId,
                        default_gpu,
                        tbLogger,
                    )

        if args.lr_scheduler == "automatic":
            # NOTE(review): `val_scores` is not defined anywhere in this
            # function — this branch would raise NameError if taken.
            lr_scheduler.step(sum(val_scores.values()))
            logger.info("best average score is %3f" % lr_scheduler.best)
        elif args.lr_scheduler == "mannul":
            lr_scheduler.step()

        if epochId in lr_reduce_list:
            for task_id in task_ids:
                # reset the task_stop_controller once the lr drop
                task_stop_controller[task_id]._reset()

        if default_gpu:
            # Save a trained model
            logger.info("** ** * Saving fine - tuned model ** ** * ")
            model_to_save = (model.module if hasattr(model, "module") else
                             model)  # Only save the model it-self
            output_model_file = os.path.join(
                savePath, "pytorch_model_" + str(epochId) + ".bin")
            output_checkpoint = os.path.join(savePath,
                                             "pytorch_ckpt_latest.tar")
            torch.save(model_to_save.state_dict(), output_model_file)
            torch.save(
                {
                    "model_state_dict": model_to_save.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "warmup_scheduler_state_dict":
                    warmup_scheduler.state_dict(),
                    # 'lr_scheduler_state_dict': lr_scheduler.state_dict(),
                    "global_step": global_step,
                    "epoch_id": epochId,
                    "task_stop_controller": task_stop_controller,
                    "tb_logger": tbLogger,
                },
                output_checkpoint,
            )
    tbLogger.txt_close()
def run_training(data_type="screw",
                 model_dir="models",
                 epochs=256,
                 pretrained=True,
                 test_epochs=10,
                 freeze_resnet=20,
                 learninig_rate=0.03,
                 optim_name="SGD",
                 batch_size=64,
                 head_layer=8):
    """Train a CutPaste ProjectionNet on one MVTec-AD category.

    Parameters
    ----------
    data_type : MVTec-AD category name (e.g. "screw").
    model_dir : directory where the final state_dict is saved.
    epochs : number of epochs; following the paper, one "epoch" is defined
        as 256 parameter-update steps (see loop below).
    pretrained : start from a pretrained backbone.
    test_epochs : run the AUC evaluation every this many epochs (<= 0 off).
    freeze_resnet : epoch at which the frozen resnet backbone is unfrozen.
    learninig_rate : optimizer learning rate (parameter name kept, typo and
        all, because callers may pass it as a keyword argument).
    optim_name : "sgd" or "adam", compared case-insensitively.
    batch_size : training batch size.
    head_layer : number of 512-wide layers in the projection head.
    """
    torch.multiprocessing.freeze_support()
    # TODO: use script params for hyperparameter
    # Temperature Hyperparameter currently not used
    temperature = 0.2

    device = "cuda"

    weight_decay = 0.00003
    momentum = 0.9
    model_name = f"model-{data_type}" + '-{date:%Y-%m-%d_%H_%M_%S}'.format(
        date=datetime.datetime.now())

    # augmentation:
    size = 256
    min_scale = 0.5

    # create Training Dataset and Dataloader
    after_cutpaste_transform = transforms.Compose([])
    after_cutpaste_transform.transforms.append(transforms.ToTensor())
    after_cutpaste_transform.transforms.append(
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]))

    train_transform = transforms.Compose([])
    train_transform.transforms.append(transforms.Resize((256, 256)))
    train_transform.transforms.append(
        CutPaste(transform=after_cutpaste_transform))

    # Images are loaded larger than `size` so CutPaste can crop patches.
    train_data = MVTecAT("Data",
                         data_type,
                         transform=train_transform,
                         size=int(size * (1 / min_scale)))
    dataloader = DataLoader(train_data,
                            batch_size=batch_size,
                            shuffle=True,
                            num_workers=8,
                            collate_fn=cut_paste_collate_fn,
                            persistent_workers=True,
                            pin_memory=True,
                            prefetch_factor=5)

    # Writer will output to ./runs/ directory by default
    writer = SummaryWriter(Path("logdirs") / model_name)

    # create Model:
    head_layers = [512] * head_layer + [128]
    print(head_layers)
    model = ProjectionNet(pretrained=pretrained, head_layers=head_layers)
    model.to(device)

    if freeze_resnet > 0:
        model.freeze_resnet()

    loss_fn = torch.nn.CrossEntropyLoss()

    # BUGFIX: the default optim_name is "SGD" but the comparison used to be
    # case-sensitive against "sgd", so the default fell through to the error
    # branch and crashed later with an undefined `optimizer`.  Compare
    # case-insensitively and fail fast on an unknown name.
    optim_key = optim_name.lower()
    if optim_key == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=learninig_rate,
                              momentum=momentum,
                              weight_decay=weight_decay)
        scheduler = CosineAnnealingWarmRestarts(optimizer, epochs)
    elif optim_key == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=learninig_rate,
                               weight_decay=weight_decay)
        scheduler = None
    else:
        raise ValueError(f"ERROR unkown optimizer: {optim_name}")

    step = 0
    num_batches = len(dataloader)

    def get_data_inf():
        # Endlessly recycle the dataloader; the loop below is driven by
        # update steps, not dataset epochs.
        while True:
            for out in enumerate(dataloader):
                yield out

    dataloader_inf = get_data_inf()
    # From paper: "Note that, unlike conventional definition for an epoch,
    # we define 256 parameter update steps as one epoch.
    for step in tqdm(range(epochs * 256)):
        epoch = int(step / 256)
        if epoch == freeze_resnet:
            model.unfreeze()

        batch_embeds = []
        batch_idx, data = next(dataloader_inf)
        x1, x2 = data
        x1 = x1.to(device)
        x2 = x2.to(device)

        # zero the parameter gradients
        optimizer.zero_grad()

        # One forward pass over the concatenated (normal, cut-pasted) batch.
        xc = torch.cat((x1, x2), axis=0)
        embeds, logits = model(xc)

        # Labels: first half of the batch is class 0, second half class 1.
        y = torch.tensor([0, 1], device=device)
        y = y.repeat_interleave(x1.size(0))
        loss = loss_fn(logits, y)

        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step(epoch + batch_idx / num_batches)

        writer.add_scalar('loss', loss.item(), step)

        predicted = torch.argmax(logits, axis=1)
        accuracy = torch.true_divide(torch.sum(predicted == y),
                                     predicted.size(0))
        writer.add_scalar('acc', accuracy, step)
        if scheduler is not None:
            writer.add_scalar('lr', scheduler.get_last_lr()[0], step)

        # save embed for validation:
        if test_epochs > 0 and epoch % test_epochs == 0:
            batch_embeds.append(embeds.cpu().detach())

        writer.add_scalar('epoch', epoch, step)

        # run tests
        # NOTE(review): this triggers on every one of the 256 steps of a
        # test epoch, not once per epoch — probably intended to also check
        # `step % 256 == 0`; kept as in the original.
        if test_epochs > 0 and epoch % test_epochs == 0:
            model.eval()
            roc_auc = eval_model(model_name,
                                 data_type,
                                 device=device,
                                 save_plots=False,
                                 size=size,
                                 show_training_data=False,
                                 model=model)
            model.train()
            writer.add_scalar('eval_auc', roc_auc, step)

    # BUGFIX: `model_dir` is a plain str, so `model_dir / f"..."` raised a
    # TypeError at save time; build the path with pathlib and make sure the
    # directory exists before saving.
    out_dir = Path(model_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    torch.save(model.state_dict(), out_dir / f"{model_name}.tch")
def main():
    """Supernet/subnet training entry point.

    Sets up logging, CIFAR-100 data loaders, a mutable ResNet-20 supernet,
    optimizer and cosine-restart scheduler; optionally resumes from the
    latest checkpoint, then alternates supernet warmup and subnet training
    with periodic validation driven by `arch_loader`.
    """
    args = get_args()

    # archLoader
    arch_loader = ArchLoader(args.path)

    # Log: mirror INFO logging to stdout and a timestamped file under ./log.
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout,
                        level=logging.INFO,
                        format=log_format,
                        datefmt='%m-%d %I:%M:%S')

    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}-{:02}-{:02}-{:.3f}'.format(
            local_time.tm_year % 2000, local_time.tm_mon, local_time.tm_mday,
            t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True

    train_dataset, val_dataset = get_dataset('cifar100')

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers,
                                               pin_memory=True)
    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers,
                                             pin_memory=True)

    model = mutableResNet20()

    logging.info('load model successfully')

    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # NOTE(review): 1000 classes here vs CIFAR-100's 100 classes — looks
    # like a copy-paste from an ImageNet script; confirm against
    # CrossEntropyLabelSmooth's expected num_classes.
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)

    if use_gpu:
        model = nn.DataParallel(model)
        loss_function = criterion_smooth.cuda()
        device = torch.device("cuda")
    else:
        loss_function = criterion_smooth
        device = torch.device("cpu")

    # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
    #     lambda step: (1.0-step/args.total_iters) if step <= args.total_iters else 0, last_epoch=-1)
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)
    # scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    #     optimizer, T_max=200)

    model = model.to(device)

    all_iters = 0
    if args.auto_continue:  # resume automatically from the latest checkpoint
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info('load from checkpoint')
            # Replay scheduler steps so the LR matches the resumed iteration.
            for i in range(iters):
                scheduler.step()

    # Parameter wiring: stash the training objects on `args` so the
    # train/validate helpers can reach them.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_loader = train_loader
    args.val_loader = val_loader

    if args.eval:
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model,
                     device,
                     args,
                     all_iters=all_iters,
                     arch_loader=arch_loader)
        exit(0)

    # warmup weights
    if args.warmup > 0:
        logging.info("begin warmup weights")
        while all_iters < args.warmup:
            all_iters = train_supernet(model,
                                       device,
                                       args,
                                       bn_process=False,
                                       all_iters=all_iters)

        validate(model,
                 device,
                 args,
                 all_iters=all_iters,
                 arch_loader=arch_loader)

    while all_iters < args.total_iters:
        logging.info("=" * 50)
        all_iters = train_subnet(model,
                                 device,
                                 args,
                                 bn_process=False,
                                 all_iters=all_iters,
                                 arch_loader=arch_loader)
        if all_iters % 200 == 0:
            logging.info("validate iter {}".format(all_iters))
            validate(model,
                     device,
                     args,
                     all_iters=all_iters,
                     arch_loader=arch_loader)
class Learner:
    """Semi-supervised segmentation trainer.

    Trains on scribble annotations; after ``config.thr_epoch`` it also uses
    confidence-thresholded pseudo-labels (stored per-image as ``weights``)
    blended over time by ``ensemble_prediction``.
    """

    def __init__(self, model, train_loader, valid_loader, config):
        self.config = config
        self.train_loader = train_loader
        self.valid_loader = valid_loader

        self.model = model.to(self.config.device)

        self.logger = init_logger(self.config.log_dir, 'train_main.log')
        self.tb_logger = init_tb_logger(self.config.log_dir, 'train_main')
        # Dump the full config once at startup for reproducibility.
        self.log('\n'.join(
            [f"{k} = {v}" for k, v in self.config.__dict__.items()]))

        self.summary_loss = AverageMeter()
        self.evaluator = Evaluator()

        # criterion: scribble (supervised) loss; u_criterion: pseudo-label loss.
        self.criterion = torch.nn.CrossEntropyLoss(
            ignore_index=self.config.ignore_index)
        self.u_criterion = torch.nn.CrossEntropyLoss(
            ignore_index=self.config.ignore_index)
        # Decoder gets a 10x higher learning rate than the (pretrained) encoder.
        train_params = [{
            'params': getattr(model, 'encoder').parameters(),
            'lr': self.config.lr
        }, {
            'params': getattr(model, 'decoder').parameters(),
            'lr': self.config.lr * 10
        }]
        self.optimizer = RAdam(train_params,
                               weight_decay=self.config.weight_decay)
        self.scheduler = CosineAnnealingWarmRestarts(self.optimizer,
                                                     T_0=2,
                                                     T_mult=2,
                                                     eta_min=1e-6)

        self.n_ensemble = 0
        self.epoch = 0
        self.best_epoch = 0
        self.best_loss = np.inf
        self.best_score = -np.inf

    def train_one_epoch(self):
        """Run one training epoch; returns the average loss."""
        self.model.train()
        self.summary_loss.reset()
        iters = len(self.train_loader)
        for step, (images, scribbles,
                   weights) in enumerate(self.train_loader):
            self.tb_logger.add_scalar('Train/lr',
                                      self.optimizer.param_groups[0]['lr'],
                                      iters * self.epoch + step)
            scribbles = scribbles.to(self.config.device).long()
            images = images.to(self.config.device)
            batch_size = images.shape[0]

            self.optimizer.zero_grad()
            outputs = self.model(images)
            if self.epoch < self.config.thr_epoch:
                # Warm-up phase: scribble supervision only.
                loss = self.criterion(outputs, scribbles)
            else:
                x_loss = self.criterion(outputs, scribbles)
                scribbles = scribbles.cpu()
                # weights[..., 0] holds the ensembled foreground probability.
                mean = weights[..., 0]
                # Pseudo-label only confidently fore/background pixels that are
                # NOT already scribble-labelled; everything else is ignored.
                u_labels = torch.where(
                    ((mean < (1 - self.config.thr_conf)) |
                     (mean > self.config.thr_conf)) &
                    (scribbles == self.config.ignore_index),
                    mean.round().long(),
                    self.config.ignore_index *
                    torch.ones_like(scribbles)).to(self.config.device)
                u_loss = self.u_criterion(outputs, u_labels)
                # Pseudo-label loss is down-weighted by 0.5.
                loss = x_loss + 0.5 * u_loss
            loss.backward()
            self.summary_loss.update(loss.detach().item(), batch_size)
            self.optimizer.step()
            # Per-step schedulers advance here; ReduceLROnPlateau steps on the
            # validation metric instead (see validation()).
            if self.scheduler.__class__.__name__ != 'ReduceLROnPlateau':
                self.scheduler.step()

        return self.summary_loss.avg

    def validation(self):
        """Evaluate on the validation set; returns (avg loss, IoU)."""
        self.model.eval()
        self.summary_loss.reset()
        self.evaluator.reset()
        for step, (_, images, _, targets) in enumerate(self.valid_loader):
            with torch.no_grad():
                targets = targets.to(self.config.device).long()
                batch_size = images.shape[0]
                images = images.to(self.config.device)
                outputs = self.model(images)
                loss = self.criterion(outputs, targets)
                targets = targets.cpu().numpy()
                outputs = torch.argmax(outputs, dim=1)
                outputs = outputs.data.cpu().numpy()
                self.evaluator.add_batch(targets, outputs)
                self.summary_loss.update(loss.detach().item(), batch_size)
        if self.scheduler.__class__.__name__ == 'ReduceLROnPlateau':
            self.scheduler.step(self.evaluator.IoU)
        return self.summary_loss.avg, self.evaluator.IoU

    def ensemble_prediction(self):
        """Blend current model predictions into each image's stored weight map.

        EMA-style update: w <- alpha * pred + (1 - alpha) * w, written back into
        the training dataset in place.
        """
        ds = self.train_loader.dataset
        transforms = Compose([Normalize(), ToTensorV2()])
        for idx, images in tqdm(ds.images.items(), total=len(ds)):
            augmented = transforms(image=images['image'])
            img = augmented['image'].unsqueeze(0).to(self.config.device)
            with torch.no_grad():
                pred = torch.nn.functional.softmax(self.model(img), dim=1)
            weight = torch.tensor(images['weight'])
            pred = pred.squeeze(0).cpu()
            # Channel 1 is taken as the foreground probability.
            x = pred[1]
            weight[..., 0] = self.config.alpha * x + (
                1 - self.config.alpha) * weight[..., 0]
            self.train_loader.dataset.images[idx]['weight'] = weight.numpy()
        self.n_ensemble += 1

    def fit(self, epochs):
        """Train for `epochs` epochs with validation and periodic ensembling."""
        for e in range(epochs):
            t = time.time()
            loss = self.train_one_epoch()
            self.log(
                f'[Train] \t Epoch: {self.epoch}, loss: {loss:.5f}, time: {(time.time() - t):.2f}'
            )
            self.tb_log(loss, None, 'Train', self.epoch)

            t = time.time()
            loss, score = self.validation()
            self.log(
                f'[Valid] \t Epoch: {self.epoch}, loss: {loss:.5f}, IoU: {score:.4f}, time: {(time.time() - t):.2f}'
            )
            self.tb_log(loss, score, 'Valid', self.epoch)
            self.post_processing(loss, score)

            # Refresh pseudo-label weights every `period_epoch` epochs.
            if (self.epoch + 1) % self.config.period_epoch == 0:
                self.log(
                    f'[Ensemble] \t the {self.n_ensemble}th Prediction Ensemble ...'
                )
                self.ensemble_prediction()

            self.epoch += 1
        self.log(
            f'best epoch: {self.best_epoch}, best loss: {self.best_loss}, best_score: {self.best_score}'
        )

    def post_processing(self, loss, score):
        """Track best loss/score; checkpoint the model on a new best score."""
        if loss < self.best_loss:
            self.best_loss = loss

        if score > self.best_score:
            self.best_score = score
            self.best_epoch = self.epoch

            self.model.eval()
            torch.save(
                {
                    'model_state_dict': self.model.state_dict(),
                    'optimizer_state_dict': self.optimizer.state_dict(),
                    'scheduler_state_dict': self.scheduler.state_dict(),
                    'best_score': self.best_score,
                    'epoch': self.epoch,
                }, f'{os.path.join(self.config.log_dir, "best_model.pth")}')
            self.log(f'best model: {self.epoch} epoch - {score:.4f}')

    def load(self, path):
        """Restore model/optimizer/scheduler state from a checkpoint file."""
        checkpoint = torch.load(path)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        self.scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        self.best_score = checkpoint['best_score']
        self.epoch = checkpoint['epoch'] + 1

    def log(self, text):
        self.logger.info(text)

    def tb_log(self, loss, IoU, split, step):
        # NOTE(review): truthiness check skips logging when loss/IoU == 0 —
        # probably intended as a None check; confirm.
        if loss:
            self.tb_logger.add_scalar(f'{split}/Loss', loss, step)
        if IoU:
            self.tb_logger.add_scalar(f'{split}/IoU', IoU, step)
class CreateModel(nn.Module):
    """Backbone + metric-learning criterion wrapper for face/feature training.

    The loss module (from `losses`) is treated as part of the model: it holds
    trainable parameters (e.g. a margin-softmax weight matrix) and is
    optimized, saved and mode-switched together with the backbone.
    """

    def __init__(self, args, class_num):
        super(CreateModel, self).__init__()
        self.args = args
        self.feature_dim = args.feature_dim
        self.device = args.device
        self.gpu_ids = args.gpu_ids

        ## Backbone selection: 'spherenetNN' (NN = layer count), mobilenet,
        ## or any other architecture exported by `networks`.
        if 'spherenet' in args.backbone:
            num_layers = int(args.backbone.split('spherenet')[-1])
            self.backbone = getattr(networks, 'spherenet')(
                num_layers, args.feature_dim, args.image_size,
                args.double_depth, args.use_batchnorm, args.use_pool,
                args.use_dropout)
        elif 'mobilenet' in args.backbone:
            self.backbone = getattr(networks, 'MobileNetV2')(args.feature_dim)
        else:
            self.backbone = getattr(networks, args.backbone)(args.feature_dim,
                                                             args.use_pool,
                                                             args.use_dropout)
        self.backbone.to(self.device)

        ## Objective function: looked up by name, then instantiated.
        self.criterion = getattr(losses, self.args.loss_type)
        self.criterion = self.criterion(class_num, self.args)
        self.criterion.to(self.device)

        # Sub-modules handled by save/eval/train; scalars reported by
        # get_current_states().
        self.model_names = ['backbone', 'criterion']
        self.state_names = ['loss_ce', 'acc', 'lr']

    def train_setup(self):
        """One-time training setup: DataParallel, optimizer, scheduler, init."""
        ## Setup nn.DataParallel if necessary
        if self.device.type != 'cpu':
            if len(self.gpu_ids) > 1:
                self.backbone = nn.DataParallel(self.backbone)

        ## Setup optimizer over backbone AND criterion parameters.
        self.lr = self.args.lr
        self.save_dir = os.path.join(self.args.checkpoints_dir,
                                     self.args.name)
        params = list(self.backbone.parameters()) + list(
            self.criterion.parameters())
        self.optimizer = optim.SGD(params,
                                   lr=self.args.lr,
                                   momentum=0.9,
                                   weight_decay=5e-4)
        # Warm restarts every 20 scheduler steps (T_0=20, T_mult=1).
        self.scheduler = CosineAnnealingWarmRestarts(self.optimizer, 20, 1)

        ## Weight initialization
        self.backbone.apply(weights_init)
        self.criterion.apply(weights_init)

        ## Switch to training mode
        self.train()

    def update_learning_rate(self):
        """Advance the LR schedule and cache the current learning rate."""
        self.scheduler.step()
        self.lr = self.optimizer.param_groups[0]['lr']

    def optimize_parameters(self, input, target):
        """One optimization step; also records loss_ce and batch accuracy."""
        input, target = input.to(self.device), target.to(self.device)
        self.score, self.loss_ce = self.forward(input, target)
        self.optimizer.zero_grad()
        self.loss_ce.backward()
        self.optimizer.step()
        _, pred_labels = torch.max(F.softmax(self.score, dim=1), 1)
        self.acc = torch.sum(torch.eq(pred_labels,
                                      target.view(-1))).item() / len(target)

    def get_current_states(self):
        """Return an ordered dict of the tracked scalar states."""
        errors_ret = OrderedDict()
        for name in self.state_names:
            if isinstance(name, str):
                # float(...) works for both scalar tensor and float number
                errors_ret[name] = float(getattr(self, name))
        return errors_ret

    def save_networks(self, which_epoch):
        """Save each sub-module's state dict as '<epoch>_net_<name>.pth'."""
        for name in self.model_names:
            if isinstance(name, str):
                save_filename = '%s_net_%s.pth' % (which_epoch, name)
                save_path = os.path.join(self.save_dir, save_filename)
                net = getattr(self, name)
                if self.gpu_ids and torch.cuda.is_available():
                    # .module exists only when wrapped in DataParallel.
                    # NOTE(review): bare except also hides real save errors.
                    try:
                        torch.save(net.module.cpu().state_dict(), save_path)
                    except:
                        torch.save(net.cpu().state_dict(), save_path)
                else:
                    torch.save(net.cpu().state_dict(), save_path)
                # Saving moved the net to CPU; move it back.
                net.to(self.device)

    def forward(self, input, target=None, is_feature=False):
        """Return raw features if is_feature, else the criterion's output."""
        features = self.backbone(input)
        if is_feature:
            return features
        else:
            return self.criterion(features, target)

    def eval(self):
        # NOTE(review): overrides nn.Module.eval(); switches listed sub-modules.
        for name in self.model_names:
            try:
                if isinstance(name, str):
                    getattr(self, name).eval()
            except:
                print('{}.eval() cannot be implemented as {} does not exist.'.
                      format(name, name))

    def train(self):
        # NOTE(review): overrides nn.Module.train() WITHOUT the `mode`
        # parameter — callers using model.train(False) will raise TypeError.
        for name in self.model_names:
            try:
                if isinstance(name, str):
                    getattr(self, name).train()
            except:
                print('{}.train() cannot be implemented as {} does not exist.'.
                      format(name, name))
def main():
    """Supernet training entry point (multi-GPU DistributedDataParallel variant).

    Initializes NCCL process groups from the launcher-provided local rank,
    splits the global batch size across workers, then mirrors the single-node
    flow: optional resume, weight warmup, subnet training, periodic validation
    (rank 0 only).
    """
    args = get_args()

    # One process per GPU; rank -> device binding.
    num_gpus = torch.cuda.device_count()
    args.gpu = args.local_rank % num_gpus
    torch.cuda.set_device(args.gpu)

    torch.distributed.init_process_group(backend='nccl',
                                         init_method='env://')
    args.world_size = torch.distributed.get_world_size()
    # Per-process batch size = global batch size / world size.
    args.batch_size = args.batch_size // args.world_size

    # Architecture sampler consumed by validate()/train_subnet().
    arch_loader = ArchLoader(args.path)

    # Log to stdout and to a timestamped file under ./log.
    log_format = '[%(asctime)s] %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO,
                        format=log_format, datefmt='%m-%d %I:%M:%S')
    t = time.time()
    local_time = time.localtime(t)
    if not os.path.exists('./log'):
        os.mkdir('./log')
    fh = logging.FileHandler(
        os.path.join('log/train-{}-{:02}-{:02}-{:.3f}'.format(
            local_time.tm_year % 2000, local_time.tm_mon, local_time.tm_mday,
            t)))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    use_gpu = False
    if torch.cuda.is_available():
        use_gpu = True

    train_loader = get_train_loader(args.batch_size, args.local_rank,
                                    args.num_workers, args.total_iters)
    val_loader = get_val_loader(args.batch_size, args.num_workers)

    model = mutableResNet20()

    logging.info('load model successfully')

    optimizer = torch.optim.SGD(get_parameters(model),
                                lr=args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # NOTE(review): 1000 classes looks like an ImageNet carry-over — confirm it
    # matches the dataset served by get_train_loader.
    criterion_smooth = CrossEntropyLabelSmooth(1000, 0.1)

    if use_gpu:
        model = model.cuda(args.gpu)
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)
        loss_function = criterion_smooth.cuda()
    else:
        loss_function = criterion_smooth

    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)

    all_iters = 0
    if args.auto_continue:  # automatically resume from the latest checkpoint
        lastest_model, iters = get_lastest_model()
        if lastest_model is not None:
            all_iters = iters
            checkpoint = torch.load(lastest_model,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint['state_dict'], strict=True)
            logging.info('load from checkpoint')
            # Fast-forward the scheduler to match the resumed iteration count.
            for i in range(iters):
                scheduler.step()

    # Parameter setup: stash the training objects on args so the train/validate
    # helpers receive everything through a single handle.
    args.optimizer = optimizer
    args.loss_function = loss_function
    args.scheduler = scheduler
    args.train_loader = train_loader
    args.val_loader = val_loader

    # Evaluation-only mode: optionally load weights, validate once, exit.
    if args.eval:
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume,
                                    map_location=None if use_gpu else 'cpu')
            model.load_state_dict(checkpoint, strict=True)
            validate(model, args, all_iters=all_iters,
                     arch_loader=arch_loader)
        exit(0)

    # Warm up the supernet weights before subnet sampling begins.
    if args.warmup > 0:
        logging.info("begin warmup weights")
        while all_iters < args.warmup:
            all_iters = train_supernet(model, args, bn_process=False,
                                       all_iters=all_iters)

        validate(model, args, all_iters=all_iters, arch_loader=arch_loader)

    while all_iters < args.total_iters:
        logging.info("=" * 50)
        all_iters = train_subnet(model, args, bn_process=False,
                                 all_iters=all_iters,
                                 arch_loader=arch_loader)

        # Validate on rank 0 only, every 200 iterations.
        # NOTE(review): if train_subnet advances all_iters by more than one per
        # call, this modulo test can skip validation points — confirm step size.
        if all_iters % 200 == 0 and args.local_rank == 0:
            logging.info("validate iter {}".format(all_iters))

            validate(model, args, all_iters=all_iters,
                     arch_loader=arch_loader)
freeze_model.load_state_dict(torch.load(pretrained_weights), strict=False) # #初始化优化器 fre_optimizer = torch.optim.Adam(freeze_model.parameters(), lr=warmup_lr, weight_decay=TRAIN["WEIGHT_DECAY"]) # #学习率调整策略:余弦退火 if freeze_lr == 'cosineAnn': fre_scheduler = CosineAnnealingLR(fre_optimizer, T_max=5, eta_min=0) elif freeze_lr == 'cosineAnnWarm': fre_scheduler = CosineAnnealingWarmRestarts(fre_optimizer, T_0=freeze_epochs, T_mult=1) elif freeze_lr == 'steplr': fre_scheduler = StepLR( fre_optimizer, step_size=(freeze_epochs * (len(frozen_dataloader) - 2)) // 2, gamma=0.1) for epoch in range(freeze_epochs): # mloss = torch.zeros(1).to(device) mloss = 0. val_loss = 0. freeze_model.train() start_time = time.time()
# NOTE(review): fragment — continues an optimizer construction that begins
# outside this view, and the trailing valid_one_epoch call is truncated.
weight_decay=args.weight_decay)

if args.find_lr:
    # One-off LR range test: sweep start_lr..end_lr over 100 iterations,
    # save the loss-vs-lr curve, restore model/optimizer state, then abort
    # the enclosing loop (presumably over folds — TODO confirm).
    lr_finder = LRFinder(model, optimizer, criterion, device=device)
    lr_finder.range_test(trn_loader,
                         start_lr=args.start_lr,
                         end_lr=args.end_lr,
                         num_iter=100,
                         accumulation_steps=args.accum_iter)
    fig_name = 'lr_curve.png'
    lr_finder.plot(fig_name)
    lr_finder.reset()
    break

# T_0 == epochs: a single cosine cycle over the whole run, no actual restart.
scheduler = CosineAnnealingWarmRestarts(optimizer,
                                        T_0=epochs,
                                        T_mult=1,
                                        eta_min=1e-6)
# Mixed-precision gradient scaler for AMP training.
scaler = GradScaler()

for epoch in range(epochs):
    train_one_epoch(fold, epoch, model, criterion, optimizer, trn_loader,
                    device, scheduler=scheduler)
    valid_one_epoch(fold, epoch, model, criterion,
class model(nn.Module):
    """Minimal one-conv module used only to give the optimizer parameters."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=3, kernel_size=3)

    def forward(self, x):
        pass


net = model()
optimizer = torch.optim.Adam(net.parameters(), lr=initial_lr)
# Warm restarts every 5 epochs (T_0=5, default T_mult=1).
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5)

print("初始化的学习率:", optimizer.defaults['lr'])

lr_list = []  # record every lr used, to plot its evolution later
for epoch in range(1, total_epoch):
    optimizer.zero_grad()
    optimizer.step()
    print("第%d个epoch的学习率:%f" % (epoch, optimizer.param_groups[0]['lr']))
    # get_last_lr() is the supported accessor; calling get_lr() directly
    # outside of step() is deprecated and emits a UserWarning.
    print(scheduler.get_last_lr())
    lr_list.append(optimizer.param_groups[0]['lr'])
    # FIX: the original never called scheduler.step(), so the printed lr
    # stayed constant and the demo showed nothing. Step once per epoch,
    # after recording the lr that was in effect for this epoch.
    scheduler.step()
class WarmRestartsCustomScheduler(_LRScheduler):
    """Custom Learning Rate Scheduler based on the 3rd Place Solution.

    This is for setting the learning rate schedule:
        Warm Restarts for epochs (1-28)
        LR=1e-5 (29-32), LR=1e-6 (33-35)
    The general version looks like this:
        # from:
        # https://github.com/naivelamb/kaggle-cloud-organization/blob/master/main_seg.py
        if epoch < start_epoch + n_epochs - 1:
            if epoch != 0:
                scheduler.step()
                scheduler=warm_restart(scheduler, T_mult=2)
        elif (epoch < start_epoch + n_epochs + 2 and
              epoch >= start_epoch + n_epochs - 1):
            optimizer.param_groups[0]['lr'] = 1e-5
        else:
            optimizer.param_groups[0]['lr'] = 5e-6
    """

    def __init__(self, optimizer, T_0, T_mult=2, eta_min=0,
                 num_wr_epochs=28, mid_const_lr_epochs_range=(29, 32),
                 constant_lrs=(1e-5, 5e-6), last_epoch=-1):
        """
        Args:
            optimizer (torch.optim.Optimizer):
            T_0: number of epochs until the first warm restart
            T_mult: restart period multiplier
            eta_min: minimum learning rate during warm restarts
            num_wr_epochs (int): The number of warm restart epochs to do
            mid_const_lr_epochs_range (list-like[int]): [min, max) — the epoch
                interval where the first lr of constant_lrs is used
            constant_lrs (list-like[float]): the learning rates to use for the
                mid and end intervals after warm restarts ends.
        """
        self.num_wr_epochs = num_wr_epochs
        # FIX: the second assert message previously said "`constant_lrs` must
        # be..." (copy-paste); it now names the argument it actually checks.
        # Defaults are also immutable tuples now instead of shared lists.
        assert len(mid_const_lr_epochs_range) == 2, \
            "`mid_const_lr_epochs_range` must be a list-like with length 2."
        self.mid_const_lr_epochs_range = mid_const_lr_epochs_range
        assert len(constant_lrs) == 2, \
            "`constant_lrs` must be a list-like with length 2."
        self.constant_lrs = constant_lrs

        self.optimizer = optimizer
        # Delegate the warm-restart phase to the stock scheduler.
        self.warm_restarts = CosineAnnealingWarmRestarts(
            self.optimizer, T_0, T_mult, eta_min)
        super().__init__(optimizer, last_epoch=last_epoch)

    def get_lr(self):
        """No calculation done here; step() sets the lrs directly."""
        return self.get_last_lr()

    def step(self, epoch=None):
        """Computes a step for the learning rate scheduler.

        Here, a step is an epoch. This is where the learning rates are set
        and the last_epoch counter is updated.
        """
        # warm restarts
        if self.last_epoch < self.num_wr_epochs + 1:
            self.warm_restarts.step()
            self.last_epoch = self.warm_restarts.last_epoch
            self._last_lr = self.warm_restarts.get_last_lr()
        # constant LR (first round)
        elif (self.last_epoch >= self.mid_const_lr_epochs_range[0] and
              self.last_epoch < self.mid_const_lr_epochs_range[1]):
            self.last_epoch += 1
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.constant_lrs[0]
            self._last_lr = [
                group['lr'] for group in self.optimizer.param_groups
            ]
        # constant LR (second round)
        else:
            for param_group in self.optimizer.param_groups:
                param_group['lr'] = self.constant_lrs[1]
            self.last_epoch += 1
            self._last_lr = [
                group['lr'] for group in self.optimizer.param_groups
            ]
# NOTE(review): fragment — `lrs`, `model`, `loaders`, `epochs`, `num_classes`
# come from outside this view, and the trailing callbacks list is truncated.
conv_lr, layer_lr, head_lr = lrs
# Differential learning rates: layer4 vs. the classifier head.
# conv_lr is currently unused — the conv1 param group is commented out.
opt = torch.optim.AdamW(
    params=[
        # {'params': model.conv1.parameters(), 'lr': conv_lr},
        {
            'params': model.layer4.parameters(),
            'lr': layer_lr
        },
        {
            'params': model.last_linear.parameters(),
            'lr': head_lr
        }
    ],
    weight_decay=0.01)
# Restart period = one epoch's worth of batches, doubling after each cycle.
sched = CosineAnnealingWarmRestarts(opt,
                                    T_0=len(loaders['train']),
                                    T_mult=2,
                                    eta_min=1e-6)
loss_fn = nn.CrossEntropyLoss()

runner = SupervisedRunner()
runner.train(model=model,
             num_epochs=epochs,
             criterion=loss_fn,
             optimizer=opt,
             scheduler=sched,
             logdir='/tmp/cells_split/',
             loaders=loaders,
             callbacks=[
                 AccuracyCallback(num_classes=num_classes),
                 BatchMetricsPlotCallback(use_env_creds=True),
                 EpochMetricsPlotCallback(use_env_creds=True)
def cos_lr_scheduler(optimizer, t_mult=5):
    """Build a CosineAnnealingWarmRestarts schedule for `optimizer`.

    Args:
        optimizer (torch.optim.Optimizer): optimizer whose lr is scheduled.
        t_mult (int): number of epochs until the first warm restart.
            NOTE: despite its name, this argument has always been passed as
            the scheduler's restart period ``T_0`` (the second positional
            parameter of CosineAnnealingWarmRestarts), NOT ``T_mult`` —
            the keyword below makes that explicit while preserving the
            historical behavior (``T_mult`` stays at its default of 1).

    Returns:
        torch.optim.lr_scheduler.CosineAnnealingWarmRestarts
    """
    return CosineAnnealingWarmRestarts(optimizer, T_0=t_mult)