def build_vocab_file_if_nonexistent( cls, corpus_files: List[str], vocab_file: str, max_vocab_size: int, tokens_with_penalty: Optional[str] = None, is_char_vocab: bool = False, embed_bytes: bool = False, padding_factor: int = 8, ) -> "Dictionary": # https://www.python.org/dev/peps/pep-0484/#forward-references if PathManager.isfile(vocab_file): d = cls.load(vocab_file) print(f"Re-using existing vocab file {vocab_file}. Specified " f"max vocab size of {max_vocab_size} may not be enforced.") return d print(f"Vocab file {vocab_file} does not exist. " "Creating new vocab file at that path.") return cls.build_vocab_file( corpus_files=corpus_files, vocab_file=vocab_file, max_vocab_size=max_vocab_size, tokens_with_penalty=tokens_with_penalty, is_char_vocab=is_char_vocab, embed_bytes=embed_bytes, padding_factor=padding_factor, )
def __call__(self, parser, namespace, values, option_string=None): if PathManager.isfile(values): with PathManager.open(values) as f: argument = f.read().strip() else: argument = values setattr(namespace, self.dest, argument)
def load_teacher_checkpoint(self, teacher_checkpoint_path): if teacher_checkpoint_path is not None and PathManager.isfile( teacher_checkpoint_path): teacher_state = None try: teacher_state = checkpoint_utils.load_checkpoint_to_cpu( teacher_checkpoint_path) except: print("Load teacher model failed!") # load model parameters try: if teacher_state is not None: self.get_teacher_model().load_state_dict( teacher_state["model"], strict=True, args=self.args) print("Load teacher model successfully!") except Exception: raise Exception( "Cannot load teacher model parameters from checkpoint {}; " "please ensure that the architectures match.".format( teacher_checkpoint_path))
def load_checkpoint( self, filename, reset_optimizer=False, reset_lr_scheduler=False, optimizer_overrides=None, reset_meters=False, ): """ Load all training state from a checkpoint file. rank = 0 will load the checkpoint, and then broadcast it to all other ranks. """ extra_state, self._optim_history, last_optim_state = None, [], None bexists = PathManager.isfile(filename) if bexists: if (self.data_parallel_rank == 0 # TPUs don't support broadcast yet, so load checkpoints # on every worker for now or self.tpu): state = checkpoint_utils.load_checkpoint_to_cpu(filename) last_optim_state = state.get("last_optimizer_state", None) # If doing zero_sharding, do not broadcast global optimizer # state. Later we will broadcast sharded states to each rank # to avoid memory from exploding. if (self.cfg.distributed_training.zero_sharding == "os" and "last_optimizer_state" in state and self.data_parallel_world_size > 1): state["last_optimizer_state"] = "SHARDED" else: last_optim_state = None state = None if (self.data_parallel_world_size > 1 # disable on TPUs until they support broadcast and not self.tpu): state = distributed_utils.broadcast_object( state, src_rank=0, group=self.data_parallel_process_group, dist_device=self.device, ) if self.data_parallel_rank > 0: last_optim_state = state.get("last_optimizer_state", None) # load model parameters try: self.get_model().load_state_dict(state["model"], strict=True, model_cfg=self.cfg.model) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict(state["criterion"], strict=True) except Exception: raise Exception( "Cannot load model parameters from checkpoint {}; " "please ensure that the architectures match.".format( filename)) extra_state = state["extra_state"] self._optim_history = state["optimizer_history"] if last_optim_state is not None and not reset_optimizer: # rebuild optimizer after loading model, since params may have changed self._build_optimizer() # only reload optimizer and lr_scheduler if they match last_optim = self._optim_history[-1] assert ( last_optim["criterion_name"] == self.get_criterion().__class__.__name__ ), "Criterion does not match; please reset the optimizer (--reset-optimizer)." assert ( last_optim["optimizer_name"] == self.optimizer.__class__.__name__ ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)." if not reset_lr_scheduler: self.lr_scheduler.load_state_dict( last_optim["lr_scheduler_state"]) if self.data_parallel_world_size > 1: last_optim_state = self.optimizer.broadcast_global_state_dict( last_optim_state) self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) self.set_num_updates(last_optim["num_updates"]) if extra_state is not None: epoch = extra_state["train_iterator"]["epoch"] logger.info("loaded checkpoint {} (epoch {} @ {} updates)".format( filename, epoch, self.get_num_updates())) if "previous_training_time" in extra_state: self._previous_training_time = extra_state[ "previous_training_time"] self._start_time = time.time() self.lr_step(epoch) if "metrics" in extra_state and not reset_meters: metrics.load_state_dict(extra_state["metrics"]) # reset TimeMeters, since their start times don't make sense anymore for meter in metrics.get_meters("default"): if isinstance(meter, meters.TimeMeter): meter.reset() else: logger.info("no existing checkpoint found {}".format(filename)) return extra_state
def load_checkpoint( self, filename, reset_optimizer=False, reset_lr_scheduler=False, optimizer_overrides=None, reset_meters=False, ): """Load all training state from a checkpoint file.""" extra_state, self._optim_history, last_optim_state = None, [], None bexists = PathManager.isfile(filename) if bexists: state = checkpoint_utils.load_checkpoint_to_cpu(filename) # load model parameters try: self.get_model().load_state_dict(state["model"], strict=True, args=self.args) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict(state["criterion"], strict=True) except Exception: import copy new_state = copy.deepcopy(state["model"]) for key in state["model"]: if 'encoder.layers.11' in key: new_key = key.replace('11', '12') new_state[new_key] = state["model"][key].clone() else: new_state[key] = state["model"][key].clone() model_state = self.get_model().state_dict() self.get_model().load_state_dict(new_state, strict=False, args=self.args) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict(state["criterion"], strict=False) print( "Cannot load model parameters from checkpoint {}; " "please ensure that the architectures match. This may be expected if you are training guided summarization models" .format(filename)) extra_state = state["extra_state"] self._optim_history = state["optimizer_history"] last_optim_state = state.get("last_optimizer_state", None) if last_optim_state is not None and not reset_optimizer: # rebuild optimizer after loading model, since params may have changed self._build_optimizer() # only reload optimizer and lr_scheduler if they match last_optim = self._optim_history[-1] assert ( last_optim["criterion_name"] == self.get_criterion().__class__.__name__ ), "Criterion does not match; please reset the optimizer (--reset-optimizer)." assert ( last_optim["optimizer_name"] == self.optimizer.__class__.__name__ ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)." if not reset_lr_scheduler: self.lr_scheduler.load_state_dict( last_optim["lr_scheduler_state"]) self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) self.set_num_updates(last_optim["num_updates"]) if extra_state is not None: epoch = extra_state["train_iterator"]["epoch"] logger.info("loaded checkpoint {} (epoch {} @ {} updates)".format( filename, epoch, self.get_num_updates())) self.lr_step(epoch) if "metrics" in extra_state and not reset_meters: metrics.load_state_dict(extra_state["metrics"]) # reset TimeMeters, since their start times don't make sense anymore for meter in metrics.get_meters("default"): if isinstance(meter, meters.TimeMeter): meter.reset() else: logger.info("no existing checkpoint found {}".format(filename)) return extra_state
def load_checkpoint( self, filename, reset_optimizer=False, reset_lr_scheduler=False, optimizer_overrides=None, reset_meters=False, ): """Load all training state from a checkpoint file.""" extra_state, self._optim_history, last_optim_state = None, [], None bexists = PathManager.isfile(filename) if bexists: state = checkpoint_utils.load_checkpoint_to_cpu(filename) # load model parameters try: self.get_model().load_state_dict( state["model"], strict=True, args=self.args ) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict( state["criterion"], strict=True ) except Exception: raise Exception( "Cannot load model parameters from checkpoint {}; " "please ensure that the architectures match.".format(filename) ) extra_state = state["extra_state"] self._optim_history = state["optimizer_history"] last_optim_state = state.get("last_optimizer_state", None) if last_optim_state is not None and not reset_optimizer: # rebuild optimizer after loading model, since params may have changed self._build_optimizer() # only reload optimizer and lr_scheduler if they match last_optim = self._optim_history[-1] assert ( last_optim["criterion_name"] == self.get_criterion().__class__.__name__ ), "Criterion does not match; please reset the optimizer (--reset-optimizer)." assert ( last_optim["optimizer_name"] == self.optimizer.__class__.__name__ ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)." if not reset_lr_scheduler: self.lr_scheduler.load_state_dict(last_optim["lr_scheduler_state"]) self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) self.set_num_updates(last_optim["num_updates"]) if extra_state is not None: epoch = extra_state["train_iterator"]["epoch"] print( "| loaded checkpoint {} (epoch {} @ {} updates)".format( filename, epoch, self.get_num_updates() ) ) self.lr_step(epoch) if "train_meters" in extra_state and not reset_meters: self.meters.update(extra_state["train_meters"]) del extra_state["train_meters"] # reset TimeMeters, since their start times don't make sense anymore for meter in self.meters.values(): if isinstance(meter, TimeMeter): meter.reset() else: print("| no existing checkpoint found {}".format(filename)) return extra_state
def load_checkpoint( self, filename, reset_optimizer=False, reset_lr_scheduler=False, optimizer_overrides=None, reset_meters=False, ): """Load all training state from a checkpoint file.""" extra_state, self._optim_history, last_optim_state = None, [], None bexists = PathManager.isfile(filename) if bexists: state = checkpoint_utils.load_checkpoint_to_cpu(filename) args = state['args'] # load model parameters model = self.get_model() states = state["model"] if hasattr(args, 'mixout') and args.mixout > 0: for k, v in list(states.items()): if '._params_learned' in k: del states[k] states[k.replace('._params_learned', '')] = v try: model.load_state_dict(states, strict=True, args=self.args) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict(state["criterion"], strict=True) except Exception as e: print( e, "Cannot load model parameters from checkpoint {}; " "please ensure that the architectures match.".format( filename)) try: print( '!!! Training Continued Ignoring The Above Error !!!') self.get_model().load_state_dict(state['model'], strict=False) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict( state['criterion'], strict=False) except: raise Exception('version is %s' % (sys.version_info, )) extra_state = state["extra_state"] self._optim_history = state["optimizer_history"] last_optim_state = state.get("last_optimizer_state", None) if last_optim_state is not None and not reset_optimizer: # rebuild optimizer after loading model, since params may have changed self._build_optimizer() # only reload optimizer and lr_scheduler if they match last_optim = self._optim_history[-1] assert ( last_optim["criterion_name"] == self.get_criterion().__class__.__name__ ), "Criterion does not match; please reset the optimizer (--reset-optimizer)." assert ( last_optim["optimizer_name"] == self.optimizer.__class__.__name__ ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)." if not reset_lr_scheduler: self.lr_scheduler.load_state_dict( last_optim["lr_scheduler_state"]) self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) self.set_num_updates(last_optim["num_updates"]) if extra_state is not None: epoch = extra_state["train_iterator"]["epoch"] logger.info("loaded checkpoint {} (epoch {} @ {} updates)".format( filename, epoch, self.get_num_updates())) self.lr_step(epoch) if "metrics" in extra_state and not reset_meters: metrics.load_state_dict(extra_state["metrics"]) # reset TimeMeters, since their start times don't make sense anymore for meter in metrics.get_meters("default"): if isinstance(meter, meters.TimeMeter): meter.reset() else: logger.info("no existing checkpoint found {}".format(filename)) return extra_state
def load_checkpoint( self, filename, reset_optimizer=False, reset_lr_scheduler=False, optimizer_overrides=None, reset_meters=False, ): """Load all training state from a checkpoint file.""" extra_state, self._optim_history, last_optim_state = None, [], None bexists = PathManager.isfile(filename) if bexists: state = checkpoint_utils.load_checkpoint_to_cpu(filename) # load model parameters try: # print("\t\t\tWARNING!!!!!!\n\n\n\t\t\tYOU DELETE THE w2v_encoder.proj !!!\n\n\n") # del state["model"]['w2v_encoder.proj.weight'] # del state["model"]['w2v_encoder.proj.bias'] # self.get_model().load_state_dict( # state["model"], strict=False, args=self.args # ) self.get_model().load_state_dict(state["model"], strict=True, args=self.args) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict(state["criterion"], strict=True) except Exception: raise Exception( "Cannot load model parameters from checkpoint {}; " "please ensure that the architectures match.".format( filename)) extra_state = state["extra_state"] self._optim_history = state["optimizer_history"] last_optim_state = state.get("last_optimizer_state", None) if last_optim_state is not None and not reset_optimizer: # rebuild optimizer after loading model, since params may have changed self._build_optimizer() # only reload optimizer and lr_scheduler if they match last_optim = self._optim_history[-1] assert ( last_optim["criterion_name"] == self.get_criterion().__class__.__name__ ), "Criterion does not match; please reset the optimizer (--reset-optimizer)." assert ( last_optim["optimizer_name"] == self.optimizer.__class__.__name__ ), "Optimizer does not match; please reset the optimizer (--reset-optimizer)." if not reset_lr_scheduler: self.lr_scheduler.load_state_dict( last_optim["lr_scheduler_state"]) self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) self.set_num_updates(last_optim["num_updates"]) if extra_state is not None: epoch = extra_state["train_iterator"]["epoch"] logger.info("loaded checkpoint {} (epoch {} @ {} updates)".format( filename, epoch, self.get_num_updates())) if "previous_training_time" in extra_state: self._previous_training_time = extra_state[ "previous_training_time"] self._start_time = time.time() self.lr_step(epoch) if "metrics" in extra_state and not reset_meters: metrics.load_state_dict(extra_state["metrics"]) # reset TimeMeters, since their start times don't make sense anymore for meter in metrics.get_meters("default"): if isinstance(meter, meters.TimeMeter): meter.reset() else: logger.info("no existing checkpoint found {}".format(filename)) return extra_state
def load_checkpoint( self, filename, reset_optimizer=False, reset_lr_scheduler=False, optimizer_overrides=None, reset_meters=False, ): """ Load all training state from a checkpoint file. rank = 0 will load the checkpoint, and then broadcast it to all other ranks. """ extra_state, self._optim_history, last_optim_state = None, [], None logger.info(f"Preparing to load checkpoint {filename}") is_distributed = self.data_parallel_world_size > 1 bexists = PathManager.isfile(filename) if bexists: load_on_all_ranks = ( self.cfg.checkpoint.load_checkpoint_on_all_dp_ranks # TPUs don't support broadcast yet, so load checkpoints # on every worker for now or self.tpu) if load_on_all_ranks or self.data_parallel_rank == 0: state = checkpoint_utils.load_checkpoint_to_cpu( filename, load_on_all_ranks=load_on_all_ranks) last_optim_state = state.get("last_optimizer_state", None) # If doing zero_sharding, do not broadcast global optimizer # state. Later we will broadcast sharded states to each rank # to avoid memory from exploding. if (not load_on_all_ranks and self.cfg.distributed_training.zero_sharding == "os" and "last_optimizer_state" in state and is_distributed): state["last_optimizer_state"] = "SHARDED" else: last_optim_state = None state = None if is_distributed and not load_on_all_ranks: state = distributed_utils.broadcast_object( state, src_rank=0, group=self.data_parallel_process_group, dist_device=self.device, ) if self.data_parallel_rank > 0: last_optim_state = state.get("last_optimizer_state", None) # load model parameters try: # model_dict = self.model.state_dict() # 1. filter out unnecessary keys # pretrained_dict = {k: v for k, v in state["model"].items() if k in model_dict # and v.size() == model_dict[k].size()} # 2. overwrite entries in the existing state dict # model_dict.update(pretrained_dict) # 3. load the new state dict # self.model.load_state_dict( # model_dict, strict=False, model_cfg=self.cfg.model # ) self.model.load_state_dict(state["model"], strict=True, model_cfg=self.cfg.model) if utils.has_parameters(self.get_criterion()): self.get_criterion().load_state_dict(state["criterion"], strict=True) except Exception: raise Exception( "Cannot load model parameters from checkpoint {}; " "please ensure that the architectures match.".format( filename)) extra_state = state["extra_state"] self._optim_history = state["optimizer_history"] if last_optim_state is not None and not reset_optimizer: # rebuild optimizer after loading model, since params may have changed self._build_optimizer() # only reload optimizer and lr_scheduler if they match last_optim = self._optim_history[-1] assert ( last_optim["criterion_name"] == self.get_criterion().__class__.__name__ ), f"Criterion does not match; please reset the optimizer (--reset-optimizer). {last_optim['criterion_name']} vs {self.get_criterion().__class__.__name__}" assert ( last_optim["optimizer_name"] == self.optimizer.__class__.__name__ ), f"Optimizer does not match; please reset the optimizer (--reset-optimizer). {last_optim['optimizer_name']} vs {self.optimizer.__class__.__name__}" if not reset_lr_scheduler: self.lr_scheduler.load_state_dict( last_optim["lr_scheduler_state"]) if not load_on_all_ranks and is_distributed: last_optim_state = self.optimizer.broadcast_global_state_dict( last_optim_state) self.optimizer.load_state_dict(last_optim_state, optimizer_overrides) self.set_num_updates(last_optim["num_updates"]) if extra_state is not None: itr_state = extra_state["train_iterator"] epoch = itr_state["epoch"] if "previous_training_time" in extra_state: self._previous_training_time = extra_state[ "previous_training_time"] self._start_time = time.time() self.lr_step(epoch) if itr_state.get("version", 1) >= 2 and itr_state["iterations_in_epoch"] == 0: # reset meters at start of epoch reset_meters = True if "metrics" in extra_state and not reset_meters: metrics.load_state_dict(extra_state["metrics"]) # reset TimeMeters, since their start times don't make sense anymore for meter in metrics.get_meters("default"): if isinstance(meter, meters.TimeMeter): meter.reset() logger.info("Loaded checkpoint {} (epoch {} @ {} updates)".format( filename, epoch, self.get_num_updates())) else: logger.info("No existing checkpoint found {}".format(filename)) return extra_state