def build(config,
          main_prog,
          startup_prog,
          class_num=None,
          step_each_epoch=100,
          is_train=True,
          is_distributed=True):
    """
    Build a static-graph program using a model and an optimizer
        1. create feeds
        2. create a model
        3. create fetchs
        4. create an optimizer (train only)

    Args:
        config(dict): config
        main_prog(): main program
        startup_prog(): startup program
        class_num(int): the class number of network, required if use_mix
        step_each_epoch(int): number of steps per epoch, forwarded to
            build_optimizer
        is_train(bool): train or eval
        is_distributed(bool): whether to use distributed training method

    Returns:
        fetchs(dict): dict of model outputs(included loss and measures)
        lr_scheduler(): second value returned by build_optimizer, or None
            when is_train is False
        feeds(dict): the feeds created by create_feeds
        optimizer(): the (possibly wrapped) optimizer, or None when
            is_train is False
    """
    with paddle.static.program_guard(main_prog, startup_prog):
        with paddle.utils.unique_name.guard():
            mode = "Train" if is_train else "Eval"
            # "batch_transform_ops" in the dataset config is what switches
            # the feeds / fetchs into their use_mix form.
            use_mix = "batch_transform_ops" in config["DataLoader"][mode][
                "dataset"]
            feeds = create_feeds(
                config["Global"]["image_shape"],
                use_mix,
                class_num=class_num,
                dtype="float32")

            # build model
            # data_format should be assigned in arch-dict
            model = build_model(config)
            out = model(feeds["data"])
            # end of build model

            fetchs = create_fetchs(
                out,
                feeds,
                config["Arch"],
                epsilon=config.get('ls_epsilon'),
                class_num=class_num,
                use_mix=use_mix,
                config=config,
                mode=mode)

            lr_scheduler = None
            optimizer = None
            if is_train:
                optimizer, lr_scheduler = build_optimizer(
                    config["Optimizer"], config["Global"]["epochs"],
                    step_each_epoch)
                optimizer = mixed_precision_optimizer(config, optimizer)
                if is_distributed:
                    optimizer = dist_optimizer(config, optimizer)
                # The first entry of fetchs['loss'] is the value to minimize.
                optimizer.minimize(fetchs['loss'][0])
    return fetchs, lr_scheduler, feeds, optimizer
def __init__(self, config, mode="train"):
    """Set up everything needed to run in the requested ``mode``.

    In order, this seeds the RNGs (if a seed is configured), initialises
    logging, selects the device, configures AMP, builds the dataloaders,
    losses and metrics required by ``mode``, builds the model, loads
    pretrained weights, builds the optimizer (train only), wraps the model
    for data-parallel execution when more than one process is running and
    builds the pre/post-processing operators (infer only).

    Args:
        config (dict): full configuration. It is stored as ``self.config``
            and mutated in place (``Arch.class_num``,
            ``DataLoader.class_num``, ``Global.distributed`` and possibly
            ``AMP.level`` are written).
        mode (str): one of "train", "eval", "infer", "export".

    Raises:
        AssertionError: if ``mode``, ``Global.eval_mode``,
            ``Global.device`` or the type of ``Global.seed`` is invalid.
    """
    assert mode in ["train", "eval", "infer", "export"]
    self.mode = mode
    self.config = config
    self.eval_mode = self.config["Global"].get("eval_mode",
                                               "classification")
    arch_config = self.config["Arch"]
    self.is_rec = bool("Head" in arch_config or
                       arch_config.get("is_rec", False))

    # set seed
    seed = self.config["Global"].get("seed", False)
    # ``False == 0`` in Python, so the "not configured" default has to be
    # excluded explicitly; otherwise a missing seed silently seeds with 0.
    if seed is not False and (seed or seed == 0):
        assert isinstance(seed, int), "The 'seed' must be a integer!"
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    # init logger
    self.output_dir = self.config['Global']['output_dir']
    log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                            f"{mode}.log")
    init_logger(log_file=log_file)
    print_config(config)

    # init train_func and eval_func
    if self.eval_mode not in ["classification", "retrieval"]:
        # Log and raise with a real message (the value returned by
        # logger.error() is not a usable assertion message).
        msg = "Invalid eval mode: {}".format(self.eval_mode)
        logger.error(msg)
        raise AssertionError(msg)
    self.train_epoch_func = train_epoch
    self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

    self.use_dali = self.config['Global'].get("use_dali", False)

    # for visualdl (only created on rank 0 while training)
    self.vdl_writer = None
    if self.config['Global'][
            'use_visualdl'] and mode == "train" and dist.get_rank() == 0:
        vdl_writer_path = os.path.join(self.output_dir, "vdl")
        # exist_ok avoids the check-then-create race.
        os.makedirs(vdl_writer_path, exist_ok=True)
        self.vdl_writer = LogWriter(logdir=vdl_writer_path)

    # set device
    assert self.config["Global"]["device"] in [
        "cpu", "gpu", "xpu", "npu", "mlu"
    ]
    self.device = paddle.set_device(self.config["Global"]["device"])
    logger.info('train with paddle {} and device {}'.format(
        paddle.__version__, self.device))

    # AMP training
    self.amp = "AMP" in self.config and self.mode == "train"
    if self.amp and self.config["AMP"] is not None:
        self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
        self.use_dynamic_loss_scaling = self.config["AMP"].get(
            "use_dynamic_loss_scaling", False)
    else:
        self.scale_loss = 1.0
        self.use_dynamic_loss_scaling = False
    if self.amp:
        AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, }
        if paddle.is_compiled_with_cuda():
            AMP_RELATED_FLAGS_SETTING.update({
                'FLAGS_cudnn_batchnorm_spatial_persistent': 1
            })
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

    # Global.class_num is the deprecated spelling of Arch.class_num; it is
    # only copied over when Arch.class_num is not set.
    if "class_num" in config["Global"]:
        global_class_num = config["Global"]["class_num"]
        if "class_num" not in config["Arch"]:
            config["Arch"]["class_num"] = global_class_num
            msg = f"The Global.class_num will be deprecated. Please use Arch.class_num instead. Arch.class_num has been set to {global_class_num}."
        else:
            msg = "The Global.class_num will be deprecated. Please use Arch.class_num instead. The Global.class_num has been ignored."
        logger.warning(msg)
    #TODO(gaotingquan): support rec
    class_num = config["Arch"].get("class_num", None)
    self.config["DataLoader"].update({"class_num": class_num})

    # Evaluation components are needed for "eval" and for "train" with
    # Global.eval_during_train enabled. The key is only read in train mode.
    need_eval = self.mode == "eval" or (
        self.mode == "train" and
        self.config["Global"]["eval_during_train"])

    # build dataloader
    if self.mode == 'train':
        self.train_dataloader = build_dataloader(
            self.config["DataLoader"], "Train", self.device, self.use_dali)
    if need_eval:
        if self.eval_mode == "classification":
            self.eval_dataloader = build_dataloader(
                self.config["DataLoader"], "Eval", self.device,
                self.use_dali)
        elif self.eval_mode == "retrieval":
            self.gallery_query_dataloader = None
            eval_loader_config = self.config["DataLoader"]["Eval"]
            if len(eval_loader_config.keys()) == 1:
                # A single entry serves as both gallery and query set.
                key = list(eval_loader_config.keys())[0]
                self.gallery_query_dataloader = build_dataloader(
                    eval_loader_config, key, self.device, self.use_dali)
            else:
                self.gallery_dataloader = build_dataloader(
                    eval_loader_config, "Gallery", self.device,
                    self.use_dali)
                self.query_dataloader = build_dataloader(
                    eval_loader_config, "Query", self.device,
                    self.use_dali)

    # build loss
    if self.mode == "train":
        loss_info = self.config["Loss"]["Train"]
        self.train_loss_func = build_loss(loss_info)
    if need_eval:
        loss_config = self.config.get("Loss", None)
        if loss_config is not None:
            loss_config = loss_config.get("Eval")
        if loss_config is not None:
            self.eval_loss_func = build_loss(loss_config)
        else:
            self.eval_loss_func = None

    # build metric
    if self.mode == 'train':
        self.train_metric_func = None
        metric_config = self.config.get("Metric")
        if metric_config is not None:
            metric_config = metric_config.get("Train")
        if metric_config is not None:
            if hasattr(self.train_dataloader, "collate_fn"
                       ) and self.train_dataloader.collate_fn is not None:
                for m_idx, m in enumerate(metric_config):
                    if "TopkAcc" in m:
                        msg = "'TopkAcc' metric can not be used when setting 'batch_transform_ops' in config. The 'TopkAcc' metric has been removed."
                        logger.warning(msg)
                        # Remove only the matching entry. Popping after
                        # the loop would drop the last metric when no
                        # 'TopkAcc' is configured and fail on an empty
                        # list.
                        metric_config.pop(m_idx)
                        break
            self.train_metric_func = build_metrics(metric_config)

    # Always define the attribute, so reading it is safe even when
    # classification evaluation has no "Eval" metric configured.
    self.eval_metric_func = None
    if need_eval:
        metric_config = self.config.get("Metric")
        if self.eval_mode == "classification":
            if metric_config is not None:
                metric_config = metric_config.get("Eval")
                if metric_config is not None:
                    self.eval_metric_func = build_metrics(metric_config)
        elif self.eval_mode == "retrieval":
            if metric_config is None:
                metric_config = [{"name": "Recallk", "topk": (1, 5)}]
            else:
                metric_config = metric_config["Eval"]
            self.eval_metric_func = build_metrics(metric_config)

    # build model
    self.model = build_model(self.config)
    # set @to_static for benchmark, skip this by default.
    apply_to_static(self.config, self.model)

    # load_pretrain
    pretrained_model = self.config["Global"]["pretrained_model"]
    if pretrained_model is not None:
        if pretrained_model.startswith("http"):
            load_dygraph_pretrain_from_url(self.model, pretrained_model)
        else:
            load_dygraph_pretrain(self.model, pretrained_model)

    # build optimizer
    if self.mode == 'train':
        self.optimizer, self.lr_sch = build_optimizer(
            self.config["Optimizer"], self.config["Global"]["epochs"],
            len(self.train_dataloader), [self.model])

    # for amp training
    if self.amp:
        self.scaler = paddle.amp.GradScaler(
            init_loss_scaling=self.scale_loss,
            use_dynamic_loss_scaling=self.use_dynamic_loss_scaling)
        # An "AMP" key with no value is tolerated above, so it must not
        # crash here either; it simply means the default level.
        amp_config = self.config['AMP']
        if amp_config is None:
            amp_level = "O1"
        else:
            amp_level = amp_config.get("level", "O1")
        if amp_level not in ["O1", "O2"]:
            msg = "[Parameter Error]: The optimize level of AMP only support 'O1' and 'O2'. The level has been set 'O1'."
            logger.warning(msg)
            self.config['AMP']["level"] = "O1"
            amp_level = "O1"
        self.model, self.optimizer = paddle.amp.decorate(
            models=self.model,
            optimizers=self.optimizer,
            level=amp_level,
            save_dtype='float32')

    # for distributed
    world_size = dist.get_world_size()
    self.config["Global"]["distributed"] = world_size != 1
    if world_size != 4 and self.mode == "train":
        msg = f"The training strategy in config files provided by PaddleClas is based on 4 gpus. But the number of gpus is {world_size} in current training. Please modify the stategy (learning rate, batch size and so on) if use config files in PaddleClas to train."
        logger.warning(msg)
    if self.config["Global"]["distributed"]:
        dist.init_parallel_env()
        self.model = paddle.DataParallel(self.model)

    # build postprocess for infer
    if self.mode == 'infer':
        self.preprocess_func = create_operators(
            self.config["Infer"]["transforms"])
        self.postprocess_func = build_postprocess(
            self.config["Infer"]["PostProcess"])
def create_optimizer(config, step_each_epoch):
    """Create the optimizer and its learning-rate object from ``config``.

    Args:
        config (dict): configuration; ``config["Optimizer"]`` and
            ``config["Global"]["epochs"]`` are read.
        step_each_epoch (int): number of steps per epoch, passed through
            to ``build_optimizer`` unchanged.

    Returns:
        tuple: the two values produced by ``build_optimizer``, in order
        (optimizer, learning-rate object).
    """
    optimizer_cfg = config["Optimizer"]
    num_epochs = config["Global"]["epochs"]
    opt, scheduler = build_optimizer(optimizer_cfg, num_epochs,
                                     step_each_epoch)
    return opt, scheduler
def __init__(self, config, mode="train"):
    """Set up everything needed to run in the requested ``mode``.

    In order, this seeds the RNGs (if a seed is configured), initialises
    logging, selects the device, configures AMP flags, builds the
    dataloaders, losses and metrics required by ``mode``, builds the model
    together with its pruner / quanter, loads pretrained weights, builds
    the optimizer (train only), wraps the model for data-parallel execution
    when more than one process is running and builds the pre/post-processing
    operators (infer only).

    Args:
        config (dict): full configuration. It is stored as ``self.config``
            and mutated in place (``DataLoader.class_num`` and
            ``Global.distributed`` are written).
        mode (str): one of "train", "eval", "infer", "export".

    Raises:
        AssertionError: if ``mode``, ``Global.eval_mode``,
            ``Global.device`` or the type of ``Global.seed`` is invalid.
    """
    assert mode in ["train", "eval", "infer", "export"]
    self.mode = mode
    self.config = config
    self.eval_mode = self.config["Global"].get("eval_mode",
                                               "classification")
    self.is_rec = "Head" in self.config["Arch"]

    # set seed
    seed = self.config["Global"].get("seed", False)
    # ``False == 0`` in Python, so the "not configured" default has to be
    # excluded explicitly; otherwise a missing seed silently seeds with 0.
    if seed is not False and (seed or seed == 0):
        assert isinstance(seed, int), "The 'seed' must be a integer!"
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)

    # init logger
    self.output_dir = self.config['Global']['output_dir']
    log_file = os.path.join(self.output_dir, self.config["Arch"]["name"],
                            f"{mode}.log")
    init_logger(name='root', log_file=log_file)
    print_config(config)

    # init train_func and eval_func
    if self.eval_mode not in ["classification", "retrieval"]:
        # Log and raise with a real message (the value returned by
        # logger.error() is not a usable assertion message).
        msg = "Invalid eval mode: {}".format(self.eval_mode)
        logger.error(msg)
        raise AssertionError(msg)
    self.train_epoch_func = train_epoch
    self.eval_func = getattr(evaluation, self.eval_mode + "_eval")

    self.use_dali = self.config['Global'].get("use_dali", False)

    # for visualdl
    self.vdl_writer = None
    if self.config['Global']['use_visualdl'] and mode == "train":
        vdl_writer_path = os.path.join(self.output_dir, "vdl")
        # exist_ok avoids the check-then-create race.
        os.makedirs(vdl_writer_path, exist_ok=True)
        self.vdl_writer = LogWriter(logdir=vdl_writer_path)

    # set device
    assert self.config["Global"]["device"] in ["cpu", "gpu", "xpu", "npu"]
    self.device = paddle.set_device(self.config["Global"]["device"])
    logger.info('train with paddle {} and device {}'.format(
        paddle.__version__, self.device))

    # AMP training
    self.amp = "AMP" in self.config
    if self.amp and self.config["AMP"] is not None:
        self.scale_loss = self.config["AMP"].get("scale_loss", 1.0)
        self.use_dynamic_loss_scaling = self.config["AMP"].get(
            "use_dynamic_loss_scaling", False)
    else:
        self.scale_loss = 1.0
        self.use_dynamic_loss_scaling = False
    if self.amp:
        AMP_RELATED_FLAGS_SETTING = {'FLAGS_max_inplace_grad_add': 8, }
        # The cuDNN flag is only requested on CUDA builds.
        # NOTE(review): presumably set_flags rejects it on other builds —
        # confirm against the Paddle docs.
        if paddle.is_compiled_with_cuda():
            AMP_RELATED_FLAGS_SETTING.update({
                'FLAGS_cudnn_batchnorm_spatial_persistent': 1
            })
        paddle.fluid.set_flags(AMP_RELATED_FLAGS_SETTING)

    #TODO(gaotingquan): support rec
    class_num = config["Arch"].get("class_num", None)
    self.config["DataLoader"].update({"class_num": class_num})

    # Evaluation components are needed for "eval" and for "train" with
    # Global.eval_during_train enabled. The key is only read in train mode.
    need_eval = self.mode == "eval" or (
        self.mode == "train" and
        self.config["Global"]["eval_during_train"])

    # build dataloader
    if self.mode == 'train':
        self.train_dataloader = build_dataloader(
            self.config["DataLoader"], "Train", self.device, self.use_dali)
    if need_eval:
        if self.eval_mode == "classification":
            self.eval_dataloader = build_dataloader(
                self.config["DataLoader"], "Eval", self.device,
                self.use_dali)
        elif self.eval_mode == "retrieval":
            self.gallery_query_dataloader = None
            eval_loader_config = self.config["DataLoader"]["Eval"]
            if len(eval_loader_config.keys()) == 1:
                # A single entry serves as both gallery and query set.
                key = list(eval_loader_config.keys())[0]
                self.gallery_query_dataloader = build_dataloader(
                    eval_loader_config, key, self.device, self.use_dali)
            else:
                self.gallery_dataloader = build_dataloader(
                    eval_loader_config, "Gallery", self.device,
                    self.use_dali)
                self.query_dataloader = build_dataloader(
                    eval_loader_config, "Query", self.device,
                    self.use_dali)

    # build loss
    if self.mode == "train":
        loss_info = self.config["Loss"]["Train"]
        self.train_loss_func = build_loss(loss_info)
    if need_eval:
        loss_config = self.config.get("Loss", None)
        if loss_config is not None:
            loss_config = loss_config.get("Eval")
        if loss_config is not None:
            self.eval_loss_func = build_loss(loss_config)
        else:
            self.eval_loss_func = None

    # build metric
    if self.mode == 'train':
        metric_config = self.config.get("Metric")
        if metric_config is not None:
            metric_config = metric_config.get("Train")
        if metric_config is not None:
            self.train_metric_func = build_metrics(metric_config)
        else:
            self.train_metric_func = None

    # Always define the attribute, so reading it is safe even when
    # classification evaluation has no "Eval" metric configured.
    self.eval_metric_func = None
    if need_eval:
        metric_config = self.config.get("Metric")
        if self.eval_mode == "classification":
            if metric_config is not None:
                metric_config = metric_config.get("Eval")
                if metric_config is not None:
                    self.eval_metric_func = build_metrics(metric_config)
        elif self.eval_mode == "retrieval":
            if metric_config is None:
                metric_config = [{"name": "Recallk", "topk": (1, 5)}]
            else:
                metric_config = metric_config["Eval"]
            self.eval_metric_func = build_metrics(metric_config)

    # build model
    self.model = build_model(self.config["Arch"])
    # set @to_static for benchmark, skip this by default.
    apply_to_static(self.config, self.model)

    # for slim
    self.pruner = get_pruner(self.config, self.model)
    self.quanter = get_quaner(self.config, self.model)

    # load_pretrain
    pretrained_model = self.config["Global"]["pretrained_model"]
    if pretrained_model is not None:
        if pretrained_model.startswith("http"):
            load_dygraph_pretrain_from_url(self.model, pretrained_model)
        else:
            load_dygraph_pretrain(self.model, pretrained_model)

    # build optimizer
    if self.mode == 'train':
        self.optimizer, self.lr_sch = build_optimizer(
            self.config["Optimizer"], self.config["Global"]["epochs"],
            len(self.train_dataloader), [self.model])

    # for distributed
    self.config["Global"][
        "distributed"] = paddle.distributed.get_world_size() != 1
    if self.config["Global"]["distributed"]:
        dist.init_parallel_env()
        self.model = paddle.DataParallel(self.model)

    # build postprocess for infer
    if self.mode == 'infer':
        self.preprocess_func = create_operators(
            self.config["Infer"]["transforms"])
        self.postprocess_func = build_postprocess(
            self.config["Infer"]["PostProcess"])