def build_task_and_model(model_dir, wait_k):
    """ Builds the wait-k task and restores one model per model directory. """
    model_dirs = flatten_string_list(model_dir)
    cfgs = ModelConfigs.load(model_dirs[0])
    cfgs["task.params"]["wait_k"] = wait_k
    task = build_task(cfgs)
    models = []
    for md in model_dirs:
        models.append(task.build_model(ModelConfigs.load(md)))
        restore_checkpoint_if_possible(models[-1], md)
    return task, models
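# A minimal usage sketch (not from the original source): how build_task_and_model
# might be called for a wait-k ensemble. The directory paths and the wait_k value
# below are hypothetical placeholders.
def _example_build_waitk_models():
    task, models = build_task_and_model(
        model_dir=["/path/to/model_a", "/path/to/model_b"],  # hypothetical checkpoint dirs
        wait_k=3)  # the decoder starts emitting after reading 3 source tokens
    print("Built {} model(s) for task {}".format(len(models), task.__class__.__name__))
    return task, models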
def __init__(self,
             model,
             task,
             metric,
             estop_patience=None,
             best_checkpoint_path=None,
             auto_average_checkpoints=True,
             best_avg_checkpoint_path=None,
             top_checkpoints_to_keep=0):
    """ Initializes a manager for arbitrary evaluation strategies.

    Args:
        model: The custom keras model (inheriting from BaseModel).
        task: The custom task.
        metric: The evaluation metric object.
        estop_patience: An integer. If greater than 0, the training process will be
            automatically shut down once no better metric score has been acquired
            for `estop_patience` consecutive evaluations.
        best_checkpoint_path: The path for saving checkpoints with the best metric
            scores. If not provided, "`model_dir`_best" is used by default.
        best_avg_checkpoint_path: The path for saving the averaged checkpoints.
        auto_average_checkpoints: A boolean, whether to average the model weights
            over the kept checkpoints. An extra directory for the averaged weights
            will be created. Only available when `best_checkpoint_path` is provided.
        top_checkpoints_to_keep: An integer, the maximum number of checkpoints to be
            saved (`max_to_keep` for the checkpoint manager), and the number of
            latest checkpoints to be averaged if `auto_average_checkpoints` is True.
            If <= 0, no more checkpoints will be saved.
    """
    self._model = model
    self._task = task
    self._metric = metric
    self._estop_patience = estop_patience
    self._best_checkpoint_path = best_checkpoint_path
    self._auto_average_checkpoints = auto_average_checkpoints
    self._best_avg_checkpoint_path = best_avg_checkpoint_path
    self._top_checkpoints_to_keep = top_checkpoints_to_keep
    self._keep_best_ckpt_saver = None
    self._average_ckpt_saver = None
    if self._top_checkpoints_to_keep and self._top_checkpoints_to_keep > 0:
        # Keep the `top_checkpoints_to_keep` best checkpoints by metric score.
        self._keep_best_ckpt_saver = KeepBestCheckpointSaver(
            model=self._model,
            directory=self._best_checkpoint_path,
            metric=self._metric,
            max_to_keep=self._top_checkpoints_to_keep)
        ModelConfigs.dump(self._task.model_configs(self._model),
                          self._keep_best_ckpt_saver.directory)
        if self._auto_average_checkpoints:
            # Additionally maintain an averaged checkpoint over the kept weights.
            self._average_ckpt_saver = AverageCheckpointSaver(
                model=self._model,
                directory=self._best_avg_checkpoint_path,
                metric=self._metric,
                max_to_keep=self._top_checkpoints_to_keep)
            ModelConfigs.dump(self._task.model_configs(self._model),
                              self._average_ckpt_saver.directory)
    self._best_metric_result = None
    self._bad_count = 0
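# A hedged sketch (not from the original source): how this manager might be
# constructed. `EvaluationManagerClass` stands in for whichever class defines the
# __init__ above (its real name is not shown here); the paths and numbers are
# hypothetical placeholders, and `model`, `task`, `metric` are assumed to be
# already-built NeurST objects.
def _example_create_manager(EvaluationManagerClass, model, task, metric):
    return EvaluationManagerClass(
        model=model,
        task=task,
        metric=metric,
        estop_patience=10,  # stop training after 10 evaluations without improvement
        best_checkpoint_path="/path/to/model_dir_best",  # keep-best checkpoint directory
        auto_average_checkpoints=True,  # also maintain an averaged checkpoint
        best_avg_checkpoint_path="/path/to/model_dir_best_avg",
        top_checkpoints_to_keep=5)  # keep and average the top 5 checkpoints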
def _build_and_restore_model(self):
    """ Builds a single model or an ensemble of models. """
    model_dirs = flatten_string_list(self.model_dir)
    if len(model_dirs) == 1:
        model = self.model
        stat = restore_checkpoint_if_possible(model, model_dirs[0])
        if not stat:
            logging.info("WARNING: Failed to restore checkpoint from {}. "
                         "We assume this was done on purpose.".format(model_dirs[0]))
    else:
        logging.info("We assume models for ensemble are all based on the same task.")
        multiple_models = []
        for idx, one_model_dir in enumerate(model_dirs):
            name_prefix = "ensemble_{}".format(idx)
            logging.info("Creating model for {} from {}".format(name_prefix, one_model_dir))
            cfg = ModelConfigs.load(one_model_dir)
            this_model = self.task.build_model(cfg, name=name_prefix)
            stat = restore_checkpoint_if_possible(this_model, one_model_dir)
            if not stat:
                logging.info("WARNING: Failed to restore checkpoint from {}. "
                             "We assume this was done on purpose.".format(one_model_dir))
            multiple_models.append(this_model)
        model = EncoderDecoderEnsembleModel.new(multiple_models)
    return model
def _pre_load_args(args):
    """ Pre-merges model configs, predefined hyperparameters and config-file args. """
    cfg_file_args = yaml_load_checking(
        load_from_config_path(
            flatten_string_list(
                getattr(args, flags_core.DEFAULT_CONFIG_FLAG.name))))
    model_dirs = flatten_string_list(args.model_dir or cfg_file_args.get("model_dir", None))
    hparams_set = args.hparams_set
    if hparams_set is None:
        hparams_set = cfg_file_args.get("hparams_set", None)
    predefined_parameters = get_hyper_parameters(hparams_set)
    # Partition the predefined hyperparameters: model-related keys keep their own
    # sections, everything else is folded into "entry.params".
    formatted_parameters = {}
    if "model.class" in predefined_parameters:
        formatted_parameters["model.class"] = predefined_parameters.pop("model.class")
    if "model" in predefined_parameters:
        formatted_parameters["model"] = predefined_parameters.pop("model")
    if "model.params" in predefined_parameters:
        formatted_parameters["model.params"] = predefined_parameters.pop("model.params")
    if len(predefined_parameters) > 0:
        formatted_parameters["entry.params"] = predefined_parameters

    try:
        model_cfgs = ModelConfigs.load(model_dirs[0])
        return deep_merge_dict(
            deep_merge_dict(model_cfgs, formatted_parameters), cfg_file_args)
    except Exception:
        return deep_merge_dict(formatted_parameters, cfg_file_args)
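# A hedged illustration (not from the original source) of the partitioning step
# performed by _pre_load_args above, using plain dicts. The hparams values below
# are made-up placeholders.
def _example_format_predefined_parameters():
    predefined_parameters = {
        "model.class": "Transformer",       # hypothetical model class
        "model.params": {"d_model": 512},   # hypothetical model params
        "beam_size": 4,                     # a non-model key, goes to "entry.params"
    }
    formatted = {}
    for key in ("model.class", "model", "model.params"):
        if key in predefined_parameters:
            formatted[key] = predefined_parameters.pop(key)
    if predefined_parameters:
        formatted["entry.params"] = predefined_parameters
    # -> {"model.class": ..., "model.params": ..., "entry.params": {"beam_size": 4}}
    return formatted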
def convert(cls, from_path, to_path):
    if not tf.io.gfile.exists(from_path):
        # `from_path` is not a local path; try to download the pretrained model.
        path = cls.download(from_path)
        if path is None:
            raise ValueError(f"Failed to find model to download: {from_path}")
        from_path = path
    try:
        cfgs = cls.convert_model_config(from_path)
    except NotImplementedError:
        cfgs = {}
    try:
        cfgs.update(cls.convert_task_config(from_path))
    except NotImplementedError:
        pass
    ModelConfigs.dump(cfgs, to_path)
    cls.convert_checkpoint(from_path, to_path)
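# A hedged usage sketch (not from the original source): `SomeConverter` stands in
# for a concrete converter class implementing download/convert_model_config/
# convert_task_config/convert_checkpoint; its name and the paths are hypothetical.
def _example_convert_pretrained(SomeConverter):
    # Converts a (possibly remote) pretrained model: model/task configs are dumped
    # to the target directory and the checkpoint is converted alongside them.
    SomeConverter.convert(from_path="pretrained_model_name_or_path",  # hypothetical
                          to_path="/path/to/converted_model")         # hypothetical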
def on_train_begin(self, logs=None):
    super(CustomCheckpointCallback, self).on_train_begin(logs)
    # Dump the model configurations alongside the checkpoints so that the model
    # can later be rebuilt from the checkpoint directory alone.
    ModelConfigs.dump(self._model_configs,
                      output_dir=self._checkpoint_manager.directory)
def run(self):
    """ Repeatedly calls the validator's validate function whenever new
        checkpoints are observed.

    Step 1: Build model.
    Step 2: Fetch training status.
    while True:
        Step 3: Restore checkpoints.
        Step 4: Validate.
    """
    if self.task is None or self.model is None:
        # Wait for the model configs to appear in the model directory.
        model_cfg_waiting_rounds = self._maximum_waiting_time // self._waiting_interval
        for i in range(model_cfg_waiting_rounds):
            try:
                args = ModelConfigs.load(self._model_dir)
                break
            except FileNotFoundError:
                logging.info(
                    f"Failed to load model configs from directory: {self.model_dir}. "
                    f"Waiting for another {self._waiting_interval}s, "
                    f"patience={model_cfg_waiting_rounds - 1 - i}.")
                time.sleep(self._waiting_interval)
        self._task = build_task(args)
        self._model = self.task.build_model(args)

    # Initialize the checkpoint manager.
    saver = compat.get_saver_or_default(self.model, self.model_dir)
    # Enable TensorBoard.
    if self._tb_log_dir is None:
        self._tb_log_dir = os.path.join(
            self.model_dir, "validation_{}".format(int(time.time())))
    file_writer = tf.summary.create_file_writer(self._tb_log_dir)
    file_writer.set_as_default()
    # Create the validator.
    self._validator.build(self.strategy, self.task, self.model)

    last_triggered_step = None
    accumulated_waiting_time = 0
    this_waiting_interval = next_waiting_interval = self._waiting_interval
    while True:
        # Fetch the checkpoint state, retrying a few times on transient errors.
        bad_cnt = 0
        while bad_cnt < 5:
            try:
                ckpt_state = tf.train.get_checkpoint_state(self.model_dir)
                break
            except ValueError:
                bad_cnt += 1
                time.sleep(5)
                logging.info(traceback.format_exc())
        if bad_cnt >= 5:
            ckpt_state = tf.train.get_checkpoint_state(self.model_dir)

        ckpts_to_be_restore = None
        if ckpt_state is None:
            logging.info(f"No checkpoint in directory: {self.model_dir}. Please wait.")
        else:
            all_ckpts = [
                (t, x) for t, x in zip(ckpt_state.all_model_checkpoint_timestamps,
                                       ckpt_state.all_model_checkpoint_paths)]
            # Collect the checkpoints newer than the last validated one,
            # ordered from oldest to newest.
            global_steps_to_be_restore = []
            ckpts_to_be_restore = []
            for ckpt in all_ckpts[::-1]:
                step = compat.hack_global_step(ckpt[1])
                if last_triggered_step is None or step > last_triggered_step:
                    ckpts_to_be_restore.insert(0, ckpt)
                    global_steps_to_be_restore.insert(0, step)
            if len(ckpts_to_be_restore) > 0:
                accumulated_waiting_time = 0
            _start_time = time.time()
            for step, (timestamp, ckpt) in zip(global_steps_to_be_restore,
                                               ckpts_to_be_restore):
                stat = saver.restore(ckpt)
                if not stat:
                    logging.info(f"Failed to restore checkpoint from {ckpt}. Skip...")
                    continue
                logging.info(
                    f"Checkpoint with global_step={step} triggered on {timestamp}")
                self._validator.validate(step)
                last_triggered_step = step
            # Deduct the time spent on validation from the waiting interval.
            this_waiting_interval = max(
                this_waiting_interval - int(time.time() - _start_time), 10)
            tf.summary.flush(file_writer)

        if ckpts_to_be_restore is None:
            pass
        elif len(ckpts_to_be_restore) > 1:
            # Several new checkpoints arrived at once: shorten the interval.
            this_waiting_interval = int(this_waiting_interval * 1.
                                        * (len(ckpts_to_be_restore) // 2)
                                        / len(ckpts_to_be_restore))
            next_waiting_interval = this_waiting_interval
        elif len(ckpts_to_be_restore) == 0:
            # No new checkpoints: halve the current wait, let the next one grow back.
            next_waiting_interval = min(
                int(this_waiting_interval * 4. / 3.), self._waiting_interval)
            this_waiting_interval = this_waiting_interval // 2

        accumulated_waiting_time += this_waiting_interval
        if accumulated_waiting_time > self._maximum_waiting_time:
            logging.info(f"Waited for maximum patience: {self._maximum_waiting_time}s")
            break
        time.sleep(this_waiting_interval)
        this_waiting_interval = next_waiting_interval
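# A hedged, standalone restatement (not from the original source) of the adaptive
# waiting-interval policy used in the loop above: validating several checkpoints in
# one round roughly halves the interval, while an idle round halves the current wait
# but lets the next one grow back toward the base interval.
def _example_next_intervals(num_new_ckpts, this_interval, base_interval):
    next_interval = this_interval
    if num_new_ckpts > 1:
        this_interval = int(this_interval * 1. * (num_new_ckpts // 2) / num_new_ckpts)
        next_interval = this_interval
    elif num_new_ckpts == 0:
        next_interval = min(int(this_interval * 4. / 3.), base_interval)
        this_interval = this_interval // 2
    return this_interval, next_interval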
def _build_task_model(strategy, model_dir, batch_size):
    """ Builds the task and model under the distribution strategy scope. """
    with training_utils.get_strategy_scope(strategy):
        model_configs = ModelConfigs.load(model_dir)
        task = build_task(model_configs, batch_size=batch_size)
        model = task.build_model(model_configs)
        return task, model
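# A hedged usage sketch (not from the original source): building the task and model
# on a single device. The model directory and batch size are hypothetical placeholders.
def _example_build_on_cpu():
    import tensorflow as tf  # assumed available, as elsewhere in this repo
    strategy = tf.distribute.OneDeviceStrategy("/cpu:0")
    task, model = _build_task_model(strategy,
                                    model_dir="/path/to/trained_model",  # hypothetical
                                    batch_size=32)
    return task, model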
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys

import tensorflow as tf

from neurst.layers.quantization import QuantLayer
from neurst.models.transformer import Transformer
from neurst.tasks import build_task
from neurst.utils.checkpoints import restore_checkpoint_if_possible
from neurst.utils.configurable import ModelConfigs

model_dir = sys.argv[1]
model_configs = ModelConfigs.load(model_dir)
QuantLayer.global_init(model_configs["enable_quant"], **model_configs["quant_params"])
task = build_task(model_configs)
model: Transformer = task.build_model(model_configs)
restore_checkpoint_if_possible(model, model_dir)

# Inspect the traced clip range of the first ffn kernel in encoder layer 0 and
# derive the symmetric clip boundaries used for quantization.
clip_max = model._encoder._stacking_layers[0][1]._layer._conv1.traced["kernel"].clip_max
weight_clip_max = tf.maximum(clip_max, 0.0)
weight_clip_max = tf.cast(weight_clip_max, tf.float32)
bits_tmp = float(2 ** (QuantLayer.quant_bits - 1))
weight_clip_min = -weight_clip_max * bits_tmp / (bits_tmp - 1)

print("The quantized weight of encoder layer0's first ffn")
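# A hedged sketch (not from the original source): how the clip range computed above
# maps a weight value onto integer levels under symmetric quantization with
# `QuantLayer.quant_bits` bits. The weight values below are made-up placeholders;
# the exact rounding NeurST applies internally may differ.
example_weights = tf.constant([-0.5, 0.0, 0.25], dtype=tf.float32)  # placeholder values
scale = weight_clip_max / (bits_tmp - 1.0)  # width of one quantization step
example_levels = tf.round(
    tf.clip_by_value(example_weights, weight_clip_min, weight_clip_max) / scale)
print("clip range: [{}, {}], step size: {}".format(
    weight_clip_min.numpy(), weight_clip_max.numpy(), scale.numpy()))
print("example integer levels:", example_levels.numpy())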