def __init__(self,
             model: Model,
             train_dataset: List[Instance],
             iterator: DataIterator,
             subtrainer_params: Params,
             cross_validation_splitter: CrossValidationSplitter,
             serialization_dir: str,
             group_key: Optional[str] = None,
             leave_model_trained: bool = False,
             validation_dataset: Optional[List[Instance]] = None,
             recover: bool = False) -> None:
    """Store everything needed to train one subtrainer per cross-validation fold.

    Parameters
    ----------
    model : the model to be (re-)trained on each fold.
    train_dataset : instances that the splitter will partition into folds.
    iterator : batching iterator handed to each subtrainer.
    subtrainer_params : config used to build each fold's subtrainer
        (``cuda_device`` is read from here, see below).
    cross_validation_splitter : produces the train/test splits.
    serialization_dir : root directory for checkpoints/logs.
    group_key : optional key used by the splitter to group instances
        (presumably so grouped instances land in the same fold — TODO confirm).
    leave_model_trained : if True, leave ``self.model`` trained after CV
        (exact use is in methods outside this view).
    validation_dataset : optional held-out data passed to subtrainers.
    recover : whether to recover a previous run.
    """
    # FIXME: does recover make sense? Maybe to continue the CV.
    # To use the same device as the subtrainers, in case `self._cuda_devices` is queried.
    cuda_device = parse_cuda_device(subtrainer_params.get('cuda_device', -1))
    super().__init__(serialization_dir, cuda_device=cuda_device)
    self.model = model
    self.train_dataset = train_dataset
    self.iterator = iterator
    self.subtrainer_params = subtrainer_params
    self.cross_validation_splitter = cross_validation_splitter
    self.group_key = group_key
    self.leave_model_trained = leave_model_trained
    self.validation_dataset = validation_dataset
    self.recover = recover
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    cache_directory: str = None,
    cache_prefix: str = None,
) -> "CallbackTrainer":
    """Build a ``CallbackTrainer`` from a configuration ``Params`` object.

    Delegates dataset/model/iterator construction to ``TrainerPieces``, pops
    the trainer-specific keys off the remaining params, constructs the
    optimizer and callbacks, and asserts the params are fully consumed.
    """
    pieces = TrainerPieces.from_params(params, serialization_dir, recover, cache_directory, cache_prefix)
    model = pieces.model
    # From here on, `params` is only the trainer sub-config.
    params = pieces.params
    validation_iterator = pieces.validation_iterator or pieces.iterator
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    # Multi-GPU config: the model itself lives on the first listed device.
    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)
    # Only parameters that require grad are handed to the optimizer.
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    callbacks_params = params.pop("callbacks", [])
    # Each callback gets the full training context so it can hook any stage.
    callbacks: List[Callback] = [
        Callback.from_params(
            params=callback_params,
            model=model,
            optimizer=optimizer,
            instances=pieces.train_dataset,
            iterator=pieces.iterator,
            shuffle=shuffle,
            validation_data=pieces.validation_dataset,
            validation_iterator=validation_iterator,
            serialization_dir=serialization_dir,
        )
        for callback_params in callbacks_params
    ]
    # Raise if the config contained keys this trainer does not understand.
    params.assert_empty(cls.__name__)
    return cls(
        model,
        pieces.train_dataset,
        pieces.iterator,
        optimizer,
        num_epochs=num_epochs,
        shuffle=shuffle,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        callbacks=callbacks,
    )
def from_params(cls,  # type: ignore
                model: Model,
                serialization_dir: str,
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'Trainer':
    # pylint: disable=arguments-differ
    """Build a ``Trainer`` from already-constructed pieces plus a trainer config.

    Pops every recognized key from ``params`` (patience, schedulers,
    checkpointing, logging intervals, ...), moves the model to the configured
    device *before* building the optimizer so optimizer state lands on the
    right device, and finally asserts the config is fully consumed.
    """
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    # Multi-GPU: the model itself lives on the first listed device.
    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None
    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None
    # The checkpointer may be configured either as a sub-object or via two
    # flat keys — but not both at once.
    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    # Raise if the config contained keys this trainer does not understand.
    params.assert_empty(cls.__name__)
    return cls(model, optimizer, iterator,
               train_data, validation_data,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average)
def from_params(cls,
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> 'PtDistTrainer':
    """Build a distributed (``torch.distributed``) trainer from a full config.

    Unlike the other factories in this file, this one builds datasets, vocab
    and model itself, then switches ``params`` over to the ``trainer``
    sub-config before popping trainer options.

    NOTE(review): ``recover`` is accepted but not used in this body.
    """
    all_datasets = training_util.datasets_from_params(params, cache_directory, cache_prefix)
    # Vocabulary is loaded from a pre-built directory, not computed here.
    vocab = Vocabulary.from_files(params.vocabulary.directory_path)
    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    model.extend_embedder_vocab()
    # Only the master process writes the vocabulary to disk.
    if is_master_rank():
        vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    batch_size = params.iterator.batch_size
    # Discard everything but the trainer sub-config, then treat it as `params`.
    trainer_params = params.pop("trainer")
    keys = [key for key in params]
    for key in keys:
        params.pop(key)
    params = trainer_params
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    pretrain_file = params.pop("pretrain_file", None)
    # Freeze any parameter whose name matches a `no_grad` regex.
    no_grad_regexes = params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)
    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
    # Each process puts the model on the GPU matching its distributed rank.
    model = model.cuda(dist.get_rank())
    if pretrain_file:
        # Warm-start weights from a pre-trained checkpoint, remapped to this rank's device.
        model_state = torch.load(pretrain_file,
                                 map_location=nn_util.device_mapping(dist.get_rank()))
        model.load_state_dict(model_state)
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    # print([n for n, p in model.named_parameters() if p.requires_grad])
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
    checkpointer = Checkpointer(
        serialization_dir=serialization_dir,
        num_serialized_models_to_keep=num_serialized_models_to_keep,
        keep_serialized_model_every_num_seconds=None)
    return cls(model, optimizer,
               train_data, validation_data,
               batch_size=batch_size,
               validation_metric=validation_metric,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               checkpointer=checkpointer)
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
    local_rank: int = 0,
) -> "Trainer":
    """Build a ``Trainer`` from a config, with optional distributed training,
    language-mean fine-tuning, and a Weights & Biases writer.
    """
    # Imported lazily, presumably to avoid a circular import — TODO confirm.
    from allennlp.training.trainer import Trainer
    from allennlp.training.trainer_pieces import TrainerPieces

    # Flat copy of the *full* config, captured before TrainerPieces consumes
    # it, so it can be reported to the experiment tracker below.
    config = dict(as_flat_dict(params.as_dict()))
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)
    model = pieces.model
    serialization_dir = serialization_dir  # NOTE(review): no-op self-assignment, kept as-is
    iterator = pieces.iterator
    train_data = pieces.train_dataset
    validation_data = pieces.validation_dataset
    params = pieces.params
    validation_iterator = pieces.validation_iterator
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters
        )
    else:
        moving_average = None
    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None
    # The checkpointer may be configured either as a sub-object or via two
    # flat keys — but not both at once.
    if "checkpointer" in params:
        if (
            "keep_serialized_model_every_num_seconds" in params
            or "num_serialized_models_to_keep" in params
        ):
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods."
            )
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None
        )
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds,
        )
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    distributed = params.pop_bool("distributed", False)
    world_size = params.pop_int("world_size", 1)
    num_gradient_accumulation_steps = params.pop("num_gradient_accumulation_steps", 1)
    lang_mean_dir = params.pop("ft_lang_mean_dir", None)
    if lang_mean_dir:
        # Best-effort: only models exposing `_lang_means` support this; any
        # other model silently skips the step (deliberate swallow).
        try:
            assert model._lang_means is not None
            lang_mean = get_lang_mean(lang_mean_dir)
            model.add_ft_lang_mean_to_lang_means(lang_mean)
        except (AttributeError, AssertionError) as e:
            pass
    writer = None
    wandb_config = params.pop("wandb", None)
    if wandb_config is not None:
        writer = WandBWriter(config, model, wandb_config)
    # Raise if the config contained keys this trainer does not understand.
    params.assert_empty(cls.__name__)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=lr_scheduler,
        momentum_scheduler=momentum_scheduler,
        checkpointer=checkpointer,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average,
        distributed=distributed,
        local_rank=local_rank,
        world_size=world_size,
        num_gradient_accumulation_steps=num_gradient_accumulation_steps,
        writer=writer,
    )
def common_init(self,
                encoder_output_dim: int,
                decoder: DecoderNet,
                decoder_type: str,
                decoder_num_layers: int,
                share_decoder_params: bool,
                start_token: str = "[CLS]",
                end_token: str = "[SEP]",
                index_name: str = "bert",
                beam_size: int = 4,
                min_dec_len: int = 4,
                max_dec_len: int = 30,
                coverage_factor: float = 0.0,
                device: Union[int, str, List[int]] = -1,
                metrics: Optional[List[Metric]] = None,
                valid_metric_keys: List[str] = None,
                seed: int = 42,
                initializer: InitializerApplicator = InitializerApplicator()):
    """Initialization routine shared by several different models.

    NOTE(review): the ``InitializerApplicator()`` default is evaluated once at
    definition time and shared across calls — confirm this is intended.
    """
    seed_everything(seed)  # initialize the random seed
    # ----------- metrics initialization -------------
    # define the metrics (caller-provided metrics override the defaults)
    self._metrics = [TokenBasedBLEU(), TokenBasedROUGE()]
    if metrics is not None:
        self._metrics = metrics
    self._rewrite_em = RewriteEM()
    self._restore_score = RestorationScore(compute_restore_tokens=True)
    self._cov_loss_value = Average()
    self.valid_metric_keys = valid_metric_keys
    # ----------- parameter initialization -------------
    # define special tokens and other parameters
    self._start_token = start_token
    self._end_token = end_token
    self._index_name = index_name
    # When using a BERT model we still read the vocabulary up front, so the
    # corresponding vocabulary namespace must be adjusted.  This is important:
    # a wrong namespace easily triggers assert_trigger_error.
    if "bert" in self._index_name:
        self._vocab_namespace = "tokens"
    else:
        self._vocab_namespace = self._index_name
    self.coverage_factor = coverage_factor
    self.decoder_num_layers = decoder_num_layers
    decoder_type = decoder_type.lower()
    # save some important decoding parameters
    self.params = Params(
        params={
            "beam_size": beam_size,
            "min_dec_len": min_dec_len,
            "max_dec_len": max_dec_len,
            "decoder_type": decoder_type
        })
    # ----------- device initialization -------------
    device = parse_cuda_device(device)
    check_for_gpu(device)  # check the GPU setting is within the available range
    if isinstance(device, list):
        device = device[0]
    if device < 0:
        self._device = torch.device("cpu")
    else:
        self._device = torch.device(f"cuda:{device}")
    # ----------- decoder initialization -------------
    # define the decoder
    self.decoder = decoder
    self._share_decoder_params = share_decoder_params
    # If the decoder is an LSTM we must decide whether to use the coverage
    # mechanism; for a Transformer, coverage is awkward to wire up, so we
    # directly use the attention distribution computed internally.
    if self.params['decoder_type'] == 'lstm':
        # for the LSTM decoder
        if self.coverage_factor > 0.0:
            # define the attention layer scoring each decoder step against the
            # encoder outputs, and the weights over the attention distributions
            # of the current turn versus history turns
            self.attention = BilinearAttention(
                vector_dim=encoder_output_dim,
                matrix_dim=encoder_output_dim + 1,
                activation=Activation.by_name('linear')())
            self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3 + 2, 2)
        else:
            self.attention = BilinearAttention(
                vector_dim=encoder_output_dim,
                matrix_dim=encoder_output_dim,
                activation=Activation.by_name('linear')())
            self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3, 2)
    else:
        # for the Transformer decoder
        self.lamb_linear = torch.nn.Linear(encoder_output_dim * 3, 2)
    # ----------- vocabulary initialization -------------
    self._vocab_size = self.vocab.get_vocab_size(namespace=self._vocab_namespace)
    self._unk_id = self.vocab.get_token_index(self.vocab._oov_token,
                                              namespace=self._vocab_namespace)
    # ----------- model parameter initialization -------------
    self._initializer = initializer
    self._initializer(self.lamb_linear)
    self._initializer(self.decoder)
def from_params(cls,  # type: ignore
                model: Model,
                serialization_dir: str,
                files_to_archive: Dict[str, str],
                iterator: DataIterator,
                train_data: Iterable[Instance],
                validation_data: Optional[Iterable[Instance]],
                params: Params,
                validation_iterator: DataIterator = None) -> 'TrainerFP16':
    # pylint: disable=arguments-differ
    """Build a ``TrainerFP16`` from already-constructed pieces plus a config.

    Pops trainer options from ``params`` (fp16 flags, schedulers,
    checkpointing, logging intervals, ...), optionally halves the model and
    wraps the optimizer for mixed-precision training, and asserts the config
    is fully consumed.

    Bug fix: ``apex`` used to be imported unconditionally, so any config with
    ``fp16: false`` crashed on machines without apex installed.  The import is
    now only attempted when fp16 training is actually requested.
    """
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    fp16 = params.pop_bool("fp16", False)
    dynamic_loss_scale = params.pop_bool("dynamic_loss_scale", True)
    validate_first = params.pop_bool("validate_first", False)
    # Multi-GPU: the model itself lives on the first listed device.
    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if fp16:
        model.half()
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    # apex is only required when fp16 training is requested; importing it here
    # (function scope) makes FusedAdam available for the isinstance check below.
    if fp16:
        try:
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if fp16:
        # The FP16_Optimizer we use depends on whether the optimizer is FusedAdam or a regular pytorch optimizer
        if isinstance(optimizer, FusedAdam):
            from apex.optimizers import FP16_Optimizer
        else:
            from apex.fp16_utils import FP16_Optimizer
        optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=dynamic_loss_scale)
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None
    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None
    # The checkpointer may be configured either as a sub-object or via two
    # flat keys — but not both at once.
    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    statistics_interval = params.pop_int("statistics_interval", 5000)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    # Raise if the config contained keys this trainer does not understand.
    params.assert_empty(cls.__name__)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=lr_scheduler,
        momentum_scheduler=momentum_scheduler,
        checkpointer=checkpointer,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        statistics_interval=statistics_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average,
        fp16=fp16,
        validate_first=validate_first,
        files_to_archive=files_to_archive)
def _from_params(cls,  # type: ignore
                 model: Model,
                 serialization_dir: str,
                 iterator: DataIterator,
                 train_data: Iterable[Instance],
                 validation_data: Optional[Iterable[Instance]],
                 params: Params,
                 validation_iterator: DataIterator = None) -> DecompTrainer:
    # pylint: disable=arguments-differ
    """Build a ``DecompTrainer``, optionally with a separate BERT optimizer.

    When ``bert_optimizer`` is configured, BERT layers at or above
    ``bert_tune_layer`` are fine-tuned with that optimizer while lower layers,
    embeddings and pooler are frozen; otherwise all BERT parameters are frozen.
    All non-BERT trainable parameters go to the main optimizer.

    Fix: the regex pattern literals are now raw strings — the previous
    ``"...\\."`` plain strings relied on Python leaving invalid escape
    sequences untouched (a DeprecationWarning, and a SyntaxWarning on newer
    Pythons).  The pattern values are unchanged.
    """
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    validation_data_path = params.pop("validation_data_path", None)
    validation_prediction_path = params.pop("validation_prediction_path", None)
    semantics_only = params.pop("semantics_only", False)
    drop_syntax = params.pop("drop_syntax", True)
    include_attribute_scores = params.pop("include_attribute_scores", False)
    warmup_epochs = params.pop("warmup_epochs", 0)
    # Multi-GPU: the model itself lives on the first listed device.
    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)
    bert_optim_params = params.pop("bert_optimizer", None)
    bert_name = "_bert_encoder"
    if bert_optim_params is not None:
        tune_after_layer_num = params.pop("bert_tune_layer", 12)
        # Embeddings and pooler are always frozen when partially tuning BERT.
        frozen_regex_str = [
            r"(_bert_encoder\.bert_model\.embeddings.*)",
            r"(_bert_encoder\.bert_model\.pooler.*)"
        ]
        tune_regex_str = []
        for i in range(0, 12):
            # match all numbers greater than layer num via disjunction
            tune_regex_one = rf"({bert_name}\.bert_model\.encoder\.layer\.{i}\..*)"
            if i >= tune_after_layer_num:
                tune_regex_str.append(tune_regex_one)
            else:
                frozen_regex_str.append(tune_regex_one)
        tune_regex = re.compile("|".join(tune_regex_str))
        frozen_regex = re.compile("|".join(frozen_regex_str))
        # decide which params require grad for which optimizer
        all_names = [n for n, p in model.named_parameters()]
        tune_bert_names = [n for n in all_names if tune_regex.match(n) is not None]
        frozen_names = [n for n in all_names if frozen_regex.match(n) is not None]
        # assert that they're disjoint
        assert (len(set(frozen_names) & set(tune_bert_names)) == 0)
        # set tunable params to require gradient, frozen ones to not require
        for i, (n, p) in enumerate(model.named_parameters()):
            if n in frozen_names:
                p.requires_grad = False
            else:
                p.requires_grad = True
        # extract BERT parameters for their dedicated optimizer
        bert_params = [[n, p] for n, p in model.named_parameters()
                       if p.requires_grad and n in tune_bert_names]
        # make sure this matches the tuneable bert params
        assert ([x[0] for x in bert_params] == tune_bert_names)
        bert_optimizer = Optimizer.from_params(bert_params, bert_optim_params)
    else:
        # freeze all BERT params
        tune_bert_names = []
        bert_optimizer = None
        for i, (n, p) in enumerate(model.named_parameters()):
            if "_bert_encoder" in n:
                p.requires_grad = False
    # model params (everything trainable that is not handled by the BERT optimizer)
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad and n not in tune_bert_names]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None
    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None
    # The checkpointer may be configured either as a sub-object or via two
    # flat keys — but not both at once.
    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    syntactic_method = params.pop("syntactic_method", None)
    accumulate_batches = params.pop("accumulate_batches", 1)
    # Raise if the config contained keys this trainer does not understand.
    params.assert_empty(cls.__name__)
    return cls(model=model,
               optimizer=optimizer,
               bert_optimizer=bert_optimizer,
               iterator=iterator,
               train_dataset=train_data,
               validation_dataset=validation_data,
               validation_data_path=validation_data_path,
               validation_prediction_path=validation_prediction_path,
               semantics_only=semantics_only,
               warmup_epochs=warmup_epochs,
               syntactic_method=syntactic_method,
               drop_syntax=drop_syntax,
               include_attribute_scores=include_attribute_scores,
               patience=patience,
               validation_metric=validation_metric,
               validation_iterator=validation_iterator,
               shuffle=shuffle,
               num_epochs=num_epochs,
               serialization_dir=serialization_dir,
               cuda_device=cuda_device,
               grad_norm=grad_norm,
               grad_clipping=grad_clipping,
               learning_rate_scheduler=lr_scheduler,
               momentum_scheduler=momentum_scheduler,
               checkpointer=checkpointer,
               model_save_interval=model_save_interval,
               summary_interval=summary_interval,
               histogram_interval=histogram_interval,
               should_log_parameter_statistics=should_log_parameter_statistics,
               should_log_learning_rate=should_log_learning_rate,
               log_batch_size_period=log_batch_size_period,
               moving_average=moving_average,
               accumulate_batches=accumulate_batches)
def from_params(  # type: ignore
    cls,
    params: Params,
    serialization_dir: str,
    recover: bool = False,
) -> "CallbackTrainer":
    """Build a ``CallbackTrainer`` (distributed-aware variant) from a config.

    Delegates dataset/model/iterator construction to ``TrainerPieces``, builds
    the optimizer and callbacks, and reads ``distributed``/``world_size``
    settings before asserting the config is fully consumed.
    """
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)
    model = pieces.model
    # From here on, `params` is only the trainer sub-config.
    params = pieces.params
    validation_iterator = pieces.validation_iterator or pieces.iterator
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    check_for_gpu(cuda_device)
    if cuda_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(cuda_device)
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    callbacks_params = params.pop("callbacks", [])
    # Each callback gets the full training context so it can hook any stage.
    callbacks: List[Callback] = [
        Callback.from_params(
            params=callback_params,
            model=model,
            optimizer=optimizer,
            instances=pieces.train_dataset,
            iterator=pieces.iterator,
            shuffle=shuffle,
            validation_data=pieces.validation_dataset,
            validation_iterator=validation_iterator,
            serialization_dir=serialization_dir,
        )
        for callback_params in callbacks_params
    ]
    distributed = params.pop_bool("distributed", False)
    world_size = params.pop_int("world_size", 1)
    # NOTE(review): in distributed mode the rank is taken from the cuda device
    # index — confirm this one-process-per-GPU assumption against the launcher.
    if distributed:
        rank = cuda_device
    else:
        rank = 0
    # Raise if the config contained keys this trainer does not understand.
    params.assert_empty(cls.__name__)
    return cls(
        model,
        pieces.train_dataset,
        pieces.iterator,
        optimizer,
        num_epochs=num_epochs,
        shuffle=shuffle,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        callbacks=callbacks,
        distributed=distributed,
        rank=rank,
        world_size=world_size,
    )
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> 'Trainer':
    # pylint: disable=arguments-differ
    """Build a meta-trainer from a config via ``MetaTrainerPieces``.

    Bug fix: the original had ``iterator = pieces.iterator,`` — the trailing
    comma wrapped the iterator in a 1-tuple, so ``cls(...)`` received a tuple
    instead of the DataIterator.  The stray comma is removed.
    """
    pieces = MetaTrainerPieces.from_params(params, serialization_dir, recover,
                                           cache_directory, cache_prefix)
    model = pieces.model
    iterator = pieces.iterator
    train_data = pieces.train_dataset
    validation_data = pieces.validation_dataset
    validation_iterator = pieces.validation_iterator
    # From here on, `params` is only the trainer sub-config.
    params = pieces.params
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    # NOTE: unlike the sibling factories, this one defaults to two GPUs.
    cuda_device = parse_cuda_device(params.pop("cuda_device", [0, 1]))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    # Multi-GPU: the model itself lives on the first listed device.
    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)
    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None
    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None
    # The checkpointer may be configured either as a sub-object or via two
    # flat keys — but not both at once.
    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int("num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool("should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    print('[info] cuda_device in metatrainer.from_param is:{}'.format(cuda_device))
    # Raise if the config contained keys this trainer does not understand.
    params.assert_empty(cls.__name__)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=lr_scheduler,
        momentum_scheduler=momentum_scheduler,
        checkpointer=checkpointer,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average,
    )
def from_params(cls,  # type: ignore
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> 'Trainer':
    # pylint: disable=arguments-differ
    """Build a ``Trainer`` from a configuration ``Params`` object.

    Consumes (pops) every trainer option it understands from ``params`` so
    that the final ``assert_empty`` can flag unrecognized keys, optionally
    replaces the freshly built model with one loaded from a
    ``pretrained_model`` archive, moves the model to the configured CUDA
    device *before* the optimizer is constructed (so optimizer state lands on
    the right device), and wires up the optional schedulers, checkpointer and
    moving average.

    Parameters
    ----------
    params : ``Params``
        Full experiment configuration; trainer options are read from
        ``pieces.params`` after ``TrainerPieces`` has consumed its share.
    serialization_dir : ``str``
        Directory for checkpoints and logs.
    recover : ``bool``, optional (default = False)
        Whether to resume from an existing serialization directory.
    cache_directory / cache_prefix : ``str``, optional
        Accepted for interface compatibility; not forwarded to
        ``TrainerPieces.from_params`` here.

    Returns
    -------
    An instance of ``cls`` configured from ``params``.

    Raises
    ------
    ConfigurationError
        If both the ``checkpointer`` key and the legacy checkpoint-keeping
        keys are present in the config.
    """
    # We have to call TrainerPieces.from_params since we are using our own Trainer
    pieces = TrainerPieces.from_params(params, serialization_dir, recover)
    model = pieces.model
    iterator = pieces.iterator
    train_data = pieces.train_dataset
    validation_data = pieces.validation_dataset
    validation_iterator = pieces.validation_iterator
    params = pieces.params  # remaining (trainer-specific) configuration

    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    accumulation_steps = params.pop("accumulation_steps", 0)
    opt_level = params.pop("opt_level", "O1")
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    half_precision = params.pop("half_precision", False)
    warmup_proportion = params.pop("warmup_proportion", None)
    pretrained_model = params.pop("pretrained_model", None)

    if pretrained_model:
        # BUGFIX: the path must be interpolated via a %s placeholder.  The
        # original call passed it as a bare extra argument, which makes the
        # logging machinery fail with "not all arguments converted" and the
        # path was never logged.
        logger.info('Loading pretrained model from %s', pretrained_model)
        model = load_archive(pretrained_model).model
        model._discriminative_loss_weight = 1  # TODO: fix this hack

    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets
        # constructed on the right device.
        model = model.cuda(model_device)

    parameters = [[n, p] for n, p in model.named_parameters() if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))

    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None

    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(
            optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(
            optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None

    if 'checkpointer' in params:
        # The two checkpoint-configuration styles are mutually exclusive.
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=keep_serialized_model_every_num_seconds)

    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool(
        "should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate", False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)

    params.assert_empty(cls.__name__)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        shuffle=shuffle,
        accumulation_steps=accumulation_steps,
        opt_level=opt_level,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=lr_scheduler,
        momentum_scheduler=momentum_scheduler,
        checkpointer=checkpointer,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average,
        half_precision=half_precision,
        warmup_proportion=warmup_proportion)
def from_params(cls,
                params: Params,
                serialization_dir: str,
                recover: bool = False,
                cache_directory: str = None,
                cache_prefix: str = None) -> 'PtTrainer':
    """Build a ``PtTrainer`` from a configuration ``Params`` object.

    Unlike the sibling ``from_params`` methods in this file, this one builds
    the datasets, vocabulary, model and iterators itself (it does not go
    through ``TrainerPieces``), then consumes the ``trainer`` sub-config to
    construct the optimizer, schedulers and checkpointer.

    NOTE(review): unlike the sibling ``from_params`` methods, there is no
    ``params.assert_empty(...)`` check at the end, so unrecognized trainer
    keys are silently ignored — confirm this is intentional.
    """
    # NOTE(review): attribute-style access on a Params object — presumably
    # supported by this project's Params implementation; verify, since the
    # sibling methods always use params.pop(...)/params.get(...).
    max_src_len = params.dataset_reader.get('max_src_len', None)
    all_datasets = training_util.datasets_from_params(
        params, cache_directory, cache_prefix)
    datasets_for_vocab_creation = set(
        params.pop("datasets_for_vocab_creation", all_datasets))
    # Every dataset named for vocab creation must actually exist.
    for dataset in datasets_for_vocab_creation:
        if dataset not in all_datasets:
            raise ConfigurationError(
                f"invalid 'dataset_for_vocab_creation' {dataset}")
    logger.info(
        "From dataset instances, %s will be considered for vocabulary creation.",
        ", ".join(datasets_for_vocab_creation))
    # On recovery, reuse the vocabulary already saved in the serialization
    # directory (and discard any 'vocabulary' config); otherwise build it
    # from the instances of the selected datasets.
    if recover and os.path.exists(
            os.path.join(serialization_dir, "vocabulary")):
        vocab = Vocabulary.from_files(
            os.path.join(serialization_dir, "vocabulary"))
        params.pop("vocabulary", {})
    else:
        vocab = Vocabulary.from_params(params.pop(
            "vocabulary", {}),
            (instance for key, dataset in all_datasets.items()
             if key in datasets_for_vocab_creation
             for instance in dataset))
    model = Model.from_params(vocab=vocab, params=params.pop('model'))
    # If vocab extension is ON for training, embedding extension should also be
    # done. If vocab and embeddings are already in sync, it would be a no-op.
    model.extend_embedder_vocab()
    # Initializing the model can have side effect of expanding the vocabulary,
    # so the vocabulary is saved only after the model is built.
    vocab.save_to_files(os.path.join(serialization_dir, "vocabulary"))
    iterator = DataIterator.from_params(params.pop("iterator"))
    iterator.index_with(model.vocab)
    validation_iterator_params = params.pop("validation_iterator", None)
    if validation_iterator_params:
        validation_iterator = DataIterator.from_params(
            validation_iterator_params)
        validation_iterator.index_with(model.vocab)
    else:
        validation_iterator = None
    train_data = all_datasets['train']
    validation_data = all_datasets.get('validation')
    # NOTE(review): test_data is read but never used below — dead local?
    test_data = all_datasets.get('test')
    trainer_params = params.pop("trainer")
    # Freeze every parameter whose name matches a 'no_grad' regex.
    no_grad_regexes = trainer_params.pop("no_grad", ())
    for name, parameter in model.named_parameters():
        if any(re.search(regex, name) for regex in no_grad_regexes):
            parameter.requires_grad_(False)
    frozen_parameter_names, tunable_parameter_names = \
        get_frozen_and_tunable_parameter_names(model)
    logger.info("Following parameters are Frozen (without gradient):")
    for name in frozen_parameter_names:
        logger.info(name)
    logger.info("Following parameters are Tunable (with gradient):")
    for name in tunable_parameter_names:
        logger.info(name)
    # From here on, 'params' refers to the trainer sub-config only.
    params = trainer_params
    patience = params.pop_int("patience", None)
    validation_metric = params.pop("validation_metric", "-loss")
    shuffle = params.pop_bool("shuffle", True)
    num_epochs = params.pop_int("num_epochs", 20)
    cuda_device = parse_cuda_device(params.pop("cuda_device", -1))
    grad_norm = params.pop_float("grad_norm", None)
    grad_clipping = params.pop_float("grad_clipping", None)
    lr_scheduler_params = params.pop("learning_rate_scheduler", None)
    momentum_scheduler_params = params.pop("momentum_scheduler", None)
    if isinstance(cuda_device, list):
        model_device = cuda_device[0]
    else:
        model_device = cuda_device
    if model_device >= 0:
        # Moving model to GPU here so that the optimizer state gets constructed on
        # the right device.
        model = model.cuda(model_device)
    parameters = [[n, p] for n, p in model.named_parameters()
                  if p.requires_grad]
    optimizer = Optimizer.from_params(parameters, params.pop("optimizer"))
    if "moving_average" in params:
        moving_average = MovingAverage.from_params(
            params.pop("moving_average"), parameters=parameters)
    else:
        moving_average = None
    if lr_scheduler_params:
        lr_scheduler = LearningRateScheduler.from_params(
            optimizer, lr_scheduler_params)
    else:
        lr_scheduler = None
    if momentum_scheduler_params:
        momentum_scheduler = MomentumScheduler.from_params(
            optimizer, momentum_scheduler_params)
    else:
        momentum_scheduler = None
    # The two checkpoint-configuration styles are mutually exclusive.
    if 'checkpointer' in params:
        if 'keep_serialized_model_every_num_seconds' in params or \
                'num_serialized_models_to_keep' in params:
            raise ConfigurationError(
                "Checkpointer may be initialized either from the 'checkpointer' key or from the "
                "keys 'num_serialized_models_to_keep' and 'keep_serialized_model_every_num_seconds'"
                " but the passed config uses both methods.")
        checkpointer = Checkpointer.from_params(params.pop("checkpointer"))
    else:
        num_serialized_models_to_keep = params.pop_int(
            "num_serialized_models_to_keep", 20)
        keep_serialized_model_every_num_seconds = params.pop_int(
            "keep_serialized_model_every_num_seconds", None)
        checkpointer = Checkpointer(
            serialization_dir=serialization_dir,
            num_serialized_models_to_keep=num_serialized_models_to_keep,
            keep_serialized_model_every_num_seconds=
            keep_serialized_model_every_num_seconds)
    model_save_interval = params.pop_float("model_save_interval", None)
    summary_interval = params.pop_int("summary_interval", 100)
    histogram_interval = params.pop_int("histogram_interval", None)
    should_log_parameter_statistics = params.pop_bool(
        "should_log_parameter_statistics", True)
    should_log_learning_rate = params.pop_bool("should_log_learning_rate",
                                               False)
    log_batch_size_period = params.pop_int("log_batch_size_period", None)
    return cls(
        model,
        optimizer,
        iterator,
        train_data,
        validation_data,
        patience=patience,
        validation_metric=validation_metric,
        validation_iterator=validation_iterator,
        max_src_len=max_src_len,
        shuffle=shuffle,
        num_epochs=num_epochs,
        serialization_dir=serialization_dir,
        cuda_device=cuda_device,
        grad_norm=grad_norm,
        grad_clipping=grad_clipping,
        learning_rate_scheduler=lr_scheduler,
        momentum_scheduler=momentum_scheduler,
        checkpointer=checkpointer,
        model_save_interval=model_save_interval,
        summary_interval=summary_interval,
        histogram_interval=histogram_interval,
        should_log_parameter_statistics=should_log_parameter_statistics,
        should_log_learning_rate=should_log_learning_rate,
        log_batch_size_period=log_batch_size_period,
        moving_average=moving_average,
        # NOTE(review): reaches into the iterator's private _batch_size —
        # fragile if the iterator implementation changes.
        batch_size=iterator._batch_size)