def create(self, model: Model) -> "Optimizer": no_decay = ["bias", "LayerNorm.weight"] parameters = [{ "params": [ p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) ], "weight_decay": self.weight_decay, }, { "params": [ p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) ], "weight_decay": 0.0, }] optimizer = AdamW( parameters, betas=(0.9, 0.98), # RoBERTa paper 参数 lr=self.lr, eps=self.eps) return optimizer
def create(self, model: Model) -> "BilstmGATOptimizerFactory": if self.optimizer_name == "SGD": optimizer = SGD(params=model.parameters(), lr=self.lr, weight_decay=self.weight_decay) elif self.optimizer_name == "Adam": optimizer = Adam(params=model.parameters(), lr=self.lr, weight_decay=self.weight_decay) else: raise RuntimeError(f"optimizer_name 必须是 Adam 或 SGD") return optimizer
def create(self, model: Model) -> "Optimizer": """ 创建 optimizer :param model: 模型 """ sentence_embedding_param_name = "_sentence_embedder.weight" parameter_dict: Dict[str, torch.nn.Parameter] = \ {name: parameter for name, parameter in model.named_parameters()} sentence_embedding_param = parameter_dict.pop( sentence_embedding_param_name) if self._is_fine_tuning: # 设置 requires_grad = True sentence_embedding_param.requires_grad = True # 分组设置 params 对于 微调的参数 设置 lr 要小一些 params = [{ "params": parameter_dict.values() }, { "params": [sentence_embedding_param], "lr": 1e-3 }] optimizer = SGD(params=params, lr=0.01) else: # 将不需要 fine tuning 参数设置成 不需要梯度更新 # 同时也不需要将这个参数放在 optimizer 中 sentence_embedding_param.requires_grad = False optimizer = SGD(params=parameter_dict.values(), lr=0.01) return optimizer
def __init__(self,
             serialize_dir: str,
             num_epoch: int,
             model: Model,
             loss: Loss,
             metrics: ModelMetricAdapter,
             optimizer_factory: OptimizerFactory,
             lr_scheduler_factory: LRSchedulerFactory = None,
             patient: int = None,
             num_check_point_keep: int = None,
             cuda_devices: List[str] = None):
    """
    Initialize the trainer.
    :param num_epoch: number of training epochs
    :param model: the model to train
    :param loss: the model's loss function
    :param metrics: the model's metric computation
    :param optimizer_factory: factory that creates the model's optimizer. Why not pass the
           optimizer directly? Because creating the optimizer depends on the model, a factory
           is passed instead. This keeps users from building the optimizer outside the trainer,
           which would otherwise cause problems inside the trainer where the optimizer depends
           on the model's parameters. A typical case: setting cuda.
    :param lr_scheduler_factory: factory that creates the lr scheduler; if `None`, no lr
           scheduler is used.
    :param serialize_dir: directory where training artifacts are saved
    :param patient: patience for early stopping. If `None`, early stopping is disabled;
           otherwise, training stops early once the tracked metric has not improved for
           `patient` epochs.
    :param num_check_point_keep: number of checkpoints to keep. If `None`, all checkpoints
           are kept; otherwise, only `num_check_point_keep` checkpoints are retained.
    :param cuda_devices: list of cuda devices, as strings in the "cuda:0" format.
    """
    if cuda_devices is not None and len(cuda_devices) != 1:
        raise RuntimeError(
            f"Only single-GPU training is supported for now, but cuda_devices is {cuda_devices}")

    if cuda_devices is not None:
        self._cuda_devices = [
            torch.device(device) for device in cuda_devices
        ]
        self._model = model.cuda(self._cuda_devices[0])
    else:
        self._cuda_devices = None
        self._model = model

    self._loss = loss
    self._metrics = metrics
    self._optimizer = optimizer_factory.create(model=self._model)

    if lr_scheduler_factory is not None:
        self._lr_scheduler = lr_scheduler_factory.create(
            optimizer=self._optimizer, model=self._model)
    else:
        self._lr_scheduler = None

    self._serialize_dir = serialize_dir
    self._metric_tracker = MetricTracker(patient=patient)
    self._num_check_point_keep = num_check_point_keep
    self._num_epoch = num_epoch
    self._current_epoch: int = None
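# Hedged, self-contained sketch of the reasoning stated in the docstring above for
# taking an optimizer *factory* rather than an optimizer: the trainer can first move
# the model to its device and only then create the optimizer, so the optimizer is
# built over the (possibly cuda) model parameters. SimpleOptimizerFactory and
# setup_training are illustrative names, not part of the original API.
import torch
from torch import nn
from torch.optim import Adam, Optimizer


class SimpleOptimizerFactory:
    def __init__(self, lr: float = 0.01):
        self.lr = lr

    def create(self, model: nn.Module) -> Optimizer:
        return Adam(params=model.parameters(), lr=self.lr)


def setup_training(model: nn.Module, factory: SimpleOptimizerFactory,
                   device: torch.device) -> Optimizer:
    # Move the model first; the factory then builds the optimizer over the moved
    # parameters, avoiding the kind of model/optimizer ordering problem the
    # docstring mentions for optimizers created outside the trainer.
    model = model.to(device)
    return factory.create(model)


optimizer = setup_training(nn.Linear(4, 2), SimpleOptimizerFactory(lr=0.01),
                           torch.device("cpu"))
print(type(optimizer).__name__)  # Adam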
def create(self, model: Model) -> "Optimizer": return Adam(params=model.parameters(), lr=0.01)
def create(self, model: Model) -> "Optimizer": return torch.optim.Adam(params=model.parameters(), lr=1e-1)
def create(self, model: Model) -> "RnnWithCrfOptimizerFactory": optimizer = Adam(params=model.parameters(), lr=0.01) return optimizer
def create(self, model: Model) -> "LatticeOptimizerFactory": optimizer = SGD(params=model.parameters(), lr=self.lr, momentum=self.momentum) return optimizer