def __init__(self, config, num_classes, train_triplet=False):
    """Set up the model, loss, optimizer, scheduler and run logging.

    :param config: configuration parameters. The attributes read directly
        here are ``model_name``, ``last_stride``, ``selected_loss``,
        ``margin``, ``epoch`` and ``save_path``; the object is also handed
        to ``get_optimizer`` / ``get_scheduler`` and dumped to JSON through
        ``config._get_kwargs()``.
    :param num_classes: number of classes in the training set; int
    :param train_triplet: whether to train only the triplet loss; bool.
        When true, weights are restored from
        ``<save_path>/<model_name>/<model_name>.pth`` before training.
    """
    self.num_classes = num_classes
    self.model_name = config.model_name
    self.last_stride = config.last_stride
    self.num_gpus = torch.cuda.device_count()
    print('Using {} GPUS'.format(self.num_gpus))
    print('NUM_CLASS: {}'.format(self.num_classes))
    print('USE LOSS: {}'.format(config.selected_loss))

    # Build the model. Whenever a GPU is available it is wrapped in
    # DataParallel; with more than one GPU it is additionally passed through
    # convert_model (the sync-BN conversion, per the original author's note).
    self.model = get_model(self.model_name, self.num_classes, self.last_stride)
    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model)
        if self.num_gpus > 1:
            self.model = convert_model(self.model)
        self.model = self.model.cuda()

    # Hyper-parameters.
    self.epoch = config.epoch

    # Solver instance that implements the training sub-routines.
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    self.solver = Solver(self.model, self.device)

    # Loss function.
    self.criterion = Loss(self.model_name, config.selected_loss, config.margin, self.num_classes)

    # Optimizer.
    self.optim = get_optimizer(config, self.model)

    # Learning-rate decay schedule.
    self.scheduler = get_scheduler(config, self.optim)

    # Directory where weights are saved. exist_ok=True avoids the
    # check-then-create race when several runs start at the same time.
    self.model_path = os.path.join(config.save_path, config.model_name)
    os.makedirs(self.model_path, exist_ok=True)

    # When only the triplet loss is trained, start from the saved weights.
    if train_triplet:
        self.solver.load_checkpoint(os.path.join(self.model_path, '{}.pth'.format(self.model_name)))

    # Save the configuration as JSON and initialise tensorboard; both are
    # keyed by the same start-time stamp.
    timestamp = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.datetime.now())
    self.writer = SummaryWriter(log_dir=os.path.join(self.model_path, timestamp))
    config_json_path = os.path.join(self.model_path, timestamp + '.json')
    with codecs.open(config_json_path, 'w', "utf-8") as json_file:
        json.dump(dict(config._get_kwargs()), json_file, ensure_ascii=False)

    # Set the random seed. Note: the seed must stay fixed when the
    # cross-validation code splits the data into training and validation
    # sets, so it is stored next to the other run artefacts.
    self.seed = int(time.time())
    seed_torch(self.seed)
    seed_pickle_path = os.path.join(self.model_path, timestamp + '.pkl')
    with open(seed_pickle_path, 'wb') as f:
        pickle.dump({'seed': self.seed}, f, -1)
def prepare(config, train_labels_number):
    """Create the model, its optimizer and the loss function.

    Args:
        config: configuration parameters
        train_labels_number: list, per-class sample counts of one fold,
            [number_class0, number_class1, ...]

    Returns:
        optimizer: the optimizer
        model: the model
        criterion: the loss function
    """
    # Build the model.
    builder = PrepareModel()
    network = builder.create_model(
        model_type=config.model_type,
        classes_num=config.num_classes,
        drop_rate=config.drop_rate,
        pretrained=True,
        bn_to_gn=config.bn_to_gn,
    )
    if torch.cuda.is_available():
        network = torch.nn.DataParallel(network).cuda()

    # Build the optimizer for the (possibly wrapped) model.
    optimizer = builder.create_optimizer(config.model_type, network, config)

    # Build the loss function.
    criterion = Loss(
        config.model_type,
        config.loss_name,
        config.num_classes,
        train_labels_number,
        config.beta_CB,
        config.gamma,
    )
    return optimizer, network, criterion
def __init__(self, config, fold):
    """Set up training of the local-attention model from the newest checkpoint.

    Args:
        config: configuration parameters
        fold: index of the current fold

    Raises:
        FileNotFoundError: if ``checkpoints/<model_type>`` has no entries to
            restore from.
    """
    self.config = config
    self.fold = fold
    self.epoch = config.epoch
    self.num_classes = config.num_classes
    self.lr_scheduler = config.lr_scheduler
    print('USE LOSS: {}'.format(config.loss_name))

    # Build the model.
    prepare_model = PrepareModel()
    self.model = prepare_model.create_local_attention_model(
        model_type=config.model_type,
        classes_num=self.num_classes,
        last_stride=2,
        droprate=0)

    # Locate the most recently modified entry under checkpoints/<model_type>.
    checkpoint_root = os.path.join('checkpoints', config.model_type)
    entries = os.listdir(checkpoint_root)
    if not entries:
        raise FileNotFoundError(
            'No checkpoints found in %s' % checkpoint_root)
    # sorted() is stable, so ties on mtime resolve as the original sort did.
    entries = sorted(
        entries,
        key=lambda fn: os.path.getmtime(os.path.join(checkpoint_root, fn)))
    weight_path = os.path.join(checkpoint_root, entries[-1], 'model_best.pth')

    # Load the previously trained weights. map_location='cpu' lets a
    # checkpoint that was saved from GPU be restored on a CPU-only host; the
    # model itself is moved to the GPU further down when one is available.
    pretrained_dict = torch.load(weight_path, map_location='cpu')['state_dict']
    model_dict = self.model.state_dict()
    # Keep only the keys that exist in the current model.
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k in model_dict
    }
    model_dict.update(pretrained_dict)
    self.model.load_state_dict(model_dict)
    print('Successfully Loaded from %s' % weight_path)

    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model)
        self.model = self.model.cuda()

    # Optimizer.
    self.optimizer = prepare_model.create_optimizer(
        config.model_type, self.model, config)

    # Learning-rate decay schedule.
    self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
        self.lr_scheduler,
        self.optimizer,
        step_size=config.lr_step_size,
        restart_step=config.restart_step,
        multi_step=config.multi_step)

    # Loss function.
    self.criterion = Loss(config.model_type, config.loss_name,
                          self.num_classes)

    # Solver instance that implements the training sub-routines.
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.solver = Solver(self.model, self.device)

    # Logging.
    self.writer, self.time_stamp = self.init_log()
    self.model_path = os.path.join(self.config.save_path,
                                   self.config.model_type, self.time_stamp)

    # Classification metric helper, fed with the class names read from the
    # label-id mapping file.
    with open("online-service/model/label_id_name.json",
              'r',
              encoding='utf-8') as json_file:
        self.class_names = list(json.load(json_file).values())
    self.classification_metric = ClassificationMetric(
        self.class_names, self.model_path)

    self.max_accuracy_valid = 0
def __init__(self, config, fold):
    """Set up model, optimizer, scheduler, loss, logging and metrics.

    Args:
        config: configuration parameters
        fold: index of the current fold
    """
    self.config = config
    self.fold = fold

    # Mirror the plain settings from config onto the instance, in the same
    # order the attributes were originally assigned.
    for name in ('epoch', 'num_classes', 'lr_scheduler'):
        setattr(self, name, getattr(config, name))
    self.save_interval = 10
    mirrored_settings = (
        'cut_mix', 'beta', 'cutmix_prob', 'auto_aug',
        # multi-scale
        'image_size', 'multi_scale', 'val_multi_scale',
        'multi_scale_size', 'multi_scale_interval',
        # sparsity training
        'sparsity', 'sparsity_scale', 'penalty_type',
        'selected_labels',
    )
    for name in mirrored_settings:
        setattr(self, name, getattr(config, name))

    # Announce the enabled training options.
    announcements = (
        (self.auto_aug, '@ Using AutoAugment.'),
        (self.cut_mix, '@ Using cut mix.'),
        (self.multi_scale, '@ Using multi scale training.'),
    )
    for enabled, message in announcements:
        if enabled:
            print(message)
    print('@ Using LOSS: {}'.format(config.loss_name))

    # Build the model, optionally restoring weights from config.weight_path.
    prepare_model = PrepareModel()
    self.model = prepare_model.create_model(
        model_type=config.model_type,
        classes_num=self.num_classes,
        drop_rate=config.drop_rate,
        pretrained=True,
        bn_to_gn=config.bn_to_gn,
    )
    if config.weight_path:
        self.model = prepare_model.load_chekpoint(self.model,
                                                  config.weight_path)

    # Sparsity training helper; stays None when sparsity is disabled.
    self.sparsity_train = None
    if config.sparsity:
        print('@ Using sparsity training.')
        self.sparsity_train = Sparsity(
            self.model,
            sparsity_scale=self.sparsity_scale,
            penalty_type=self.penalty_type,
        )

    # L1 regularisation; l1_reg_loss is only created when it is enabled.
    self.l1_regular = config.l1_regular
    self.l1_decay = config.l1_decay
    if self.l1_regular:
        print('@ Using l1_regular')
        self.l1_reg_loss = Regularization(self.model,
                                          weight_decay=self.l1_decay,
                                          p=1)

    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model).cuda()

    # Optimizer.
    self.optimizer = prepare_model.create_optimizer(config.model_type,
                                                    self.model, config)

    # Learning-rate decay schedule.
    self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
        self.lr_scheduler,
        self.optimizer,
        step_size=config.lr_step_size,
        restart_step=config.restart_step,
        multi_step=config.multi_step,
        warmup=config.warmup,
        multiplier=config.multiplier,
        warmup_epoch=config.warmup_epoch,
        delay_epoch=config.delay_epoch,
    )

    # Loss function.
    self.criterion = Loss(config.model_type, config.loss_name,
                          self.num_classes)

    # Solver instance that implements the training sub-routines.
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)
    self.solver = Solver(self.model, self.device)

    # Logging.
    self.writer, self.time_stamp = self.init_log()
    self.model_path = os.path.join(self.config.save_path,
                                   self.config.model_type,
                                   self.time_stamp)

    # Classification metric helper, fed with the class names read from the
    # label-id mapping file.
    with open("online-service/model/label_id_name.json", 'r',
              encoding='utf-8') as json_file:
        label_id_to_name = json.load(json_file)
        self.class_names = list(label_id_to_name.values())
    self.classification_metric = ClassificationMetric(self.class_names,
                                                      self.model_path)

    self.max_accuracy_valid = 0
def __init__(self, config, fold, train_labels_number):
    """Set up model, optimizer, scheduler, loss, optional restore and logging.

    Args:
        config: configuration parameters
        fold: int, index of the current fold
        train_labels_number: list, per-class sample counts of one fold,
            [number_class0, number_class1, ...]
    """
    self.config = config
    self.fold = fold

    # Mirror the plain settings from config onto the instance, in the same
    # order the attributes were originally assigned.
    for name in ('epoch', 'num_classes', 'lr_scheduler'):
        setattr(self, name, getattr(config, name))
    self.save_interval = 100
    for name in ('cut_mix', 'beta', 'cutmix_prob', 'image_size',
                 'multi_scale', 'multi_scale_size', 'multi_scale_interval'):
        setattr(self, name, getattr(config, name))

    # Announce the enabled training options.
    for enabled, message in ((self.cut_mix, 'Using cut mix.'),
                             (self.multi_scale, 'Using multi scale training.')):
        if enabled:
            print(message)
    print('USE LOSS: {}'.format(config.loss_name))

    # Build the model.
    prepare_model = PrepareModel()
    self.model = prepare_model.create_model(
        model_type=config.model_type,
        classes_num=self.num_classes,
        drop_rate=config.drop_rate,
        pretrained=True,
        bn_to_gn=config.bn_to_gn,
    )
    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model).cuda()

    # Optimizer.
    self.optimizer = prepare_model.create_optimizer(config.model_type,
                                                    self.model, config)

    # Learning-rate decay schedule.
    self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
        self.lr_scheduler,
        self.optimizer,
        step_size=config.lr_step_size,
        restart_step=config.restart_step,
        multi_step=config.multi_step,
    )

    # Loss function.
    self.criterion = Loss(config.model_type, config.loss_name,
                          self.num_classes, train_labels_number,
                          config.beta_CB, config.gamma)

    # Solver instance that implements the training sub-routines.
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)
    self.solver = Solver(self.model, self.device)

    # Optionally restore weights. config.restore is either the literal
    # 'last' (pick the most recently modified entry) or the name of an entry
    # under checkpoints/<model_type>.
    if config.restore:
        checkpoint_root = os.path.join('checkpoints', config.model_type)
        if config.restore == 'last':
            # All entries of the folder, ordered by last-modified time.
            candidates = sorted(
                os.listdir(checkpoint_root),
                key=lambda fn: os.path.getmtime(checkpoint_root + '/' + fn))
            chosen_run = candidates[-1]
        else:
            chosen_run = config.restore
        weight_path = os.path.join(checkpoint_root, chosen_run,
                                   'model_best.pth')
        self.solver.load_checkpoint(weight_path)

    # Logging.
    self.writer, self.time_stamp = self.init_log()
    self.model_path = os.path.join(self.config.train_url,
                                   self.config.model_type,
                                   self.time_stamp)

    # Classification metric helper, fed with the class names read from the
    # label-id mapping file.
    with open("online-service/model/label_id_name.json", 'r',
              encoding='utf-8') as json_file:
        label_id_to_name = json.load(json_file)
        self.class_names = list(label_id_to_name.values())
    self.classification_metric = ClassificationMetric(self.class_names,
                                                      self.model_path,
                                                      text_flag=0)

    self.max_accuracy_valid = 0
def __init__(self, config, fold, train_labels_number):
    """Set up training, fetching pre-trained weights through ``mox`` first.

    Args:
        config: configuration parameters
        fold: int, index of the current fold
        train_labels_number: list, per-class sample counts of one fold,
            [number_class0, number_class1, ...]
    """
    self.config = config
    self.fold = fold
    self.epoch = config.epoch
    self.num_classes = config.num_classes
    self.lr_scheduler = config.lr_scheduler
    self.cut_mix = config.cut_mix
    self.beta = config.beta
    self.cutmix_prob = config.cutmix_prob
    self.train_url = config.train_url
    self.bucket_name = config.bucket_name

    self.image_size = config.image_size
    self.multi_scale = config.multi_scale
    self.multi_scale_size = config.multi_scale_size
    self.multi_scale_interval = config.multi_scale_interval
    if self.cut_mix:
        print('Using cut mix.')
    if self.multi_scale:
        print('Using multi scale training.')
    print('USE LOSS: {}'.format(config.loss_name))

    # Copy the pre-trained weights from <bucket_name>/model_zoo into the
    # local torch checkpoint cache unless they are already there. Both
    # weight files are fetched regardless of config.model_type.
    print("=> using pre-trained model '{}'".format(config.model_type))
    cache_dir = '/home/work/.cache/torch/checkpoints'
    pretrained_files = (
        'se_resnext101_32x4d-3b2fe3d8.pth',
        'efficientnet-b5-b6417697.pth',
    )
    for weight_file in pretrained_files:
        local_path = cache_dir + '/' + weight_file
        if not mox.file.exists(local_path):
            mox.file.copy(
                os.path.join(self.bucket_name, 'model_zoo/' + weight_file),
                local_path)
            print('copy pre-trained model from OBS to: %s success' %
                  (os.path.abspath(local_path)))
        else:
            print('use exist pre-trained model at: %s' %
                  (os.path.abspath(local_path)))

    # Build the model.
    prepare_model = PrepareModel()
    self.model = prepare_model.create_model(model_type=config.model_type,
                                            classes_num=self.num_classes,
                                            drop_rate=config.drop_rate,
                                            pretrained=True,
                                            bn_to_gn=config.bn_to_gn)
    # Only move to the GPU when one exists, in line with the CPU fallback
    # used for self.device below; an unconditional .cuda() raises on
    # CPU-only hosts.
    if torch.cuda.is_available():
        self.model = torch.nn.DataParallel(self.model).cuda()

    # Optimizer.
    self.optimizer = prepare_model.create_optimizer(
        config.model_type, self.model, config)

    # Learning-rate decay schedule.
    self.exp_lr_scheduler = prepare_model.create_lr_scheduler(
        self.lr_scheduler,
        self.optimizer,
        step_size=config.lr_step_size,
        restart_step=config.restart_step,
        multi_step=config.multi_step)

    # Loss function.
    self.criterion = Loss(config.model_type, config.loss_name,
                          self.num_classes, train_labels_number,
                          config.beta_CB, config.gamma)

    # Solver instance that implements the training sub-routines.
    self.device = torch.device(
        'cuda' if torch.cuda.is_available() else 'cpu')
    self.solver = Solver(self.model, self.device)

    # Optionally restore weights. config.restore is either the literal
    # 'last' (pick the most recently modified entry) or the name of an entry
    # under checkpoints/<model_type>.
    if config.restore:
        weight_path = os.path.join('checkpoints', config.model_type)
        if config.restore == 'last':
            # All entries of the folder, ordered by last-modified time.
            lists = sorted(
                os.listdir(weight_path),
                key=lambda fn: os.path.getmtime(os.path.join(weight_path, fn)))
            weight_path = os.path.join(weight_path, lists[-1],
                                       'model_best.pth')
        else:
            weight_path = os.path.join(weight_path, config.restore,
                                       'model_best.pth')
        self.solver.load_checkpoint(weight_path)

    # Logging.
    self.writer, self.time_stamp = self.init_log()
    self.model_path = os.path.join(self.config.train_local,
                                   self.config.model_type, self.time_stamp)

    # Classification metric helper, fed with the class names read from the
    # label-id mapping file.
    # NOTE(review): the path is built by plain concatenation, so
    # config.local_data_root must end with a path separator - confirm.
    with open(config.local_data_root + 'label_id_name.json',
              'r',
              encoding='utf-8') as json_file:
        self.class_names = list(json.load(json_file).values())
    self.classification_metric = ClassificationMetric(self.class_names,
                                                      self.model_path,
                                                      text_flag=0)

    self.max_accuracy_valid = 0