Example 1
def build_task_and_model(model_dir, wait_k):
    model_dirs = flatten_string_list(model_dir)
    cfgs = ModelConfigs.load(model_dirs[0])
    cfgs["task.params"]["wait_k"] = wait_k
    task = build_task(cfgs)
    models = []
    for md in model_dirs:
        models.append(task.build_model(ModelConfigs.load(md)))
        restore_checkpoint_if_possible(models[-1], md)
    return task, models
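
A minimal usage sketch of the function above (hypothetical directories; `model_dir` may be a single path or a list, which `flatten_string_list` normalizes):

# Hypothetical usage: build an ensemble of two simultaneous-translation models
# that share the same task configuration, decoding with wait_k = 3.
task, models = build_task_and_model(
    ["/path/to/model_a", "/path/to/model_b"],  # hypothetical model directories
    wait_k=3)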
Example 2
    def __init__(self,
                 model,
                 task,
                 metric,
                 estop_patience=None,
                 best_checkpoint_path=None,
                 auto_average_checkpoints=True,
                 best_avg_checkpoint_path=None,
                 top_checkpoints_to_keep=0):
        """ Initializes manager for arbitrary evaluation strategies.

        Args:
            model: The custom Keras model (inheriting from BaseModel).
            task: The custom task.
            metric: The evaluation metric object.
            estop_patience: An integer. If greater than 0, training will stop automatically
                once no better metric score has been obtained for this many evaluations.
            best_checkpoint_path: The path for checkpoints with the best metric scores if provided,
                otherwise the default "`model_dir`_best" will be used.
            auto_average_checkpoints: A boolean, whether to average model weights across the saved
                checkpoints. An extra directory for the averaged weights will be created. Only
                available when `best_checkpoint_path` is provided.
            best_avg_checkpoint_path: The path for saving the averaged checkpoints.
            top_checkpoints_to_keep: An integer, the maximum number of checkpoints to keep
                (`max_to_keep` for the checkpoint manager) and the number of latest checkpoints to be
                averaged if `auto_average_checkpoints` is True. If <= 0, no checkpoints will be saved.
        """
        self._model = model
        self._task = task
        self._metric = metric
        self._estop_patience = estop_patience
        self._best_checkpoint_path = best_checkpoint_path
        self._auto_average_checkpoints = auto_average_checkpoints
        self._best_avg_checkpoint_path = best_avg_checkpoint_path
        self._top_checkpoints_to_keep = top_checkpoints_to_keep
        self._keep_best_ckpt_saver = None
        self._average_ckpt_saver = None
        if self._top_checkpoints_to_keep and self._top_checkpoints_to_keep > 0:
            self._keep_best_ckpt_saver = KeepBestCheckpointSaver(
                model=self._model,
                directory=self._best_checkpoint_path,
                metric=self._metric,
                max_to_keep=self._top_checkpoints_to_keep)
            ModelConfigs.dump(self._task.model_configs(self._model),
                              self._keep_best_ckpt_saver.directory)
            if self._auto_average_checkpoints:
                self._average_ckpt_saver = AverageCheckpointSaver(
                    model=self._model,
                    directory=self._best_avg_checkpoint_path,
                    metric=self._metric,
                    max_to_keep=self._top_checkpoints_to_keep)
                ModelConfigs.dump(self._task.model_configs(self._model),
                                  self._average_ckpt_saver.directory)
        self._best_metric_result = None
        self._bad_count = 0
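
A minimal construction sketch for this initializer. `EvaluationManager` is a stand-in for the class name, which the excerpt does not show, the paths are hypothetical, and the metric object is assumed to be built elsewhere:

from neurst.tasks import build_task
from neurst.utils.configurable import ModelConfigs

# "EvaluationManager" is a hypothetical stand-in for the class whose __init__
# is shown above; task and model are built as in the other examples.
model_configs = ModelConfigs.load("/path/to/model_dir")
task = build_task(model_configs)
model = task.build_model(model_configs)
metric = ...  # the evaluation metric object used by the task (construction not shown here)

manager = EvaluationManager(
    model=model,
    task=task,
    metric=metric,
    estop_patience=10,                         # stop after 10 evaluations without improvement
    best_checkpoint_path="/path/to/model_dir_best",
    auto_average_checkpoints=True,
    best_avg_checkpoint_path="/path/to/model_dir_best_avg",
    top_checkpoints_to_keep=5)                 # keep (and average over) at most 5 checkpoints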
Example 3
    def _build_and_restore_model(self):
        """ Build a single model or ensemble model. """
        model_dirs = flatten_string_list(self.model_dir)
        if len(model_dirs) == 1:
            model = self.model
            stat = restore_checkpoint_if_possible(model, model_dirs[0])
            if not stat:
                logging.info("WARNING: Fail to restore checkpoint from {}. "
                             "We assume this was done on purpose. ".format(model_dirs[0]))
        else:
            logging.info("We assume models for ensemble are all based on the same task.")
            multiple_models = []
            for idx, one_model_dir in enumerate(model_dirs):
                name_prefix = "ensemble_{}".format(idx)
                logging.info("Create model for {} from {}".format(name_prefix, one_model_dir))
                cfg = ModelConfigs.load(one_model_dir)
                this_model = self.task.build_model(cfg, name=name_prefix)
                stat = restore_checkpoint_if_possible(this_model, one_model_dir)
                if not stat:
                    logging.info("WARNING: Fail to restore checkpoint from {}. "
                                 "We assume this was done on purpose. ".format(one_model_dir))
                multiple_models.append(this_model)
            model = EncoderDecoderEnsembleModel.new(multiple_models)
        return model
Example 4
def _pre_load_args(args):
    cfg_file_args = yaml_load_checking(
        load_from_config_path(
            flatten_string_list(
                getattr(args, flags_core.DEFAULT_CONFIG_FLAG.name))))
    model_dirs = flatten_string_list(args.model_dir
                                     or cfg_file_args.get("model_dir", None))
    hparams_set = args.hparams_set
    if hparams_set is None:
        hparams_set = cfg_file_args.get("hparams_set", None)
    predefined_parameters = get_hyper_parameters(hparams_set)
    formatted_parameters = {}
    if "model.class" in predefined_parameters:
        formatted_parameters["model.class"] = predefined_parameters.pop(
            "model.class")
    if "model" in predefined_parameters:
        formatted_parameters["model"] = predefined_parameters.pop("model")
    if "model.params" in predefined_parameters:
        formatted_parameters["model.params"] = predefined_parameters.pop(
            "model.params")
    if len(predefined_parameters) > 0:
        formatted_parameters["entry.params"] = predefined_parameters

    try:
        model_cfgs = ModelConfigs.load(model_dirs[0])
        return deep_merge_dict(
            deep_merge_dict(model_cfgs, formatted_parameters), cfg_file_args)
    except Exception:
        return deep_merge_dict(formatted_parameters, cfg_file_args)
Example 5
    @classmethod
    def convert(cls, from_path, to_path):
        if not tf.io.gfile.exists(from_path):
            path = cls.download(from_path)
            if path is None:
                raise ValueError(
                    f"Fail to find model to download: {from_path}")
            from_path = path
        try:
            cfgs = cls.convert_model_config(from_path)
        except NotImplementedError:
            cfgs = {}
        try:
            cfgs.update(cls.convert_task_config(from_path))
        except NotImplementedError:
            pass
        ModelConfigs.dump(cfgs, to_path)
        cls.convert_checkpoint(from_path, to_path)
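
The `convert` method above is a template method: subclasses provide `convert_model_config`, `convert_task_config`, and `convert_checkpoint`, and either config hook may raise NotImplementedError to be skipped. A minimal subclass sketch, assuming the base class is the one shown above (called `Converter` here for illustration) and using made-up conversion logic:

class MyConverter(Converter):  # "Converter" = the base class whose convert() is shown above (name assumed)

    @classmethod
    def convert_model_config(cls, from_path):
        # Return a neurst-style model config dict derived from the source model.
        return {"model.class": "Transformer"}

    @classmethod
    def convert_task_config(cls, from_path):
        # Raising NotImplementedError is fine: convert() silently skips this part.
        raise NotImplementedError

    @classmethod
    def convert_checkpoint(cls, from_path, to_path):
        # Read the source weights and write a TensorFlow checkpoint under to_path.
        pass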
Example 6
    def on_train_begin(self, logs=None):
        super(CustomCheckpointCallback, self).on_train_begin(logs)
        ModelConfigs.dump(self._model_configs,
                          output_dir=self._checkpoint_manager.directory)
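
`ModelConfigs.dump` writes the configuration into the given directory so that the `ModelConfigs.load` calls seen in the other examples can rebuild the task and model from that directory later. A minimal round-trip sketch with hypothetical directories:

from neurst.tasks import build_task
from neurst.utils.configurable import ModelConfigs

# Hypothetical round-trip: persist the configs next to the checkpoints, then
# reload them to rebuild the task and model from that directory alone.
model_configs = ModelConfigs.load("/path/to/trained_model")
ModelConfigs.dump(model_configs, output_dir="/path/to/exported_model")
reloaded = ModelConfigs.load("/path/to/exported_model")
task = build_task(reloaded)
model = task.build_model(reloaded)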
Example 7
    def run(self):
        """ Repeats to call validator's validate function if new checkponts are observed.

        Step 1: Build model.
        Step 2: Fetch training status.
        while True:
            Step 3: Restore checkpoints.
            Step 4: Validate.
        """
        if self.task is None or self.model is None:
            model_cfg_waiting_rounds = self._maximum_waiting_time // self._waiting_interval
            for i in range(model_cfg_waiting_rounds):
                try:
                    args = ModelConfigs.load(self._model_dir)
                    break
                except FileNotFoundError:
                    logging.info(
                        f"Fail to load model configs from directory: {self.model_dir}. "
                        f"Wait for another {self._waiting_interval}s, "
                        f"patience={model_cfg_waiting_rounds - 1 - i}.")
                    time.sleep(self._waiting_interval)
            self._task = build_task(args)
            self._model = self.task.build_model(args)
        # initialize the checkpoint manager
        saver = compat.get_saver_or_default(self.model, self.model_dir)
        # enable tensorboard
        if self._tb_log_dir is None:
            self._tb_log_dir = os.path.join(
                self.model_dir, "validation_{}".format(int(time.time())))
        file_writer = tf.summary.create_file_writer(self._tb_log_dir)
        file_writer.set_as_default()
        # create training
        self._validator.build(self.strategy, self.task, self.model)
        last_triggered_step = None
        accumulated_waiting_time = 0
        this_waiting_interval = next_waiting_interval = self._waiting_interval
        while True:
            bad_cnt = 0
            while bad_cnt < 5:
                try:
                    ckpt_state = tf.train.get_checkpoint_state(self.model_dir)
                    break
                except ValueError:
                    bad_cnt += 1
                    time.sleep(5)
                    logging.info(traceback.format_exc())
                    if bad_cnt >= 5:
                        ckpt_state = tf.train.get_checkpoint_state(
                            self.model_dir)

            ckpts_to_be_restore = None
            if ckpt_state is None:
                logging.info(
                    f"No checkpoint in directory: {self.model_dir}. Please wait."
                )
            else:
                all_ckpts = [
                    (t, x)
                    for t, x in zip(ckpt_state.all_model_checkpoint_timestamps,
                                    ckpt_state.all_model_checkpoint_paths)
                ]
                global_steps_to_be_restore = []
                ckpts_to_be_restore = []
                for ckpt in all_ckpts[::-1]:
                    step = compat.hack_global_step(ckpt[1])
                    if last_triggered_step is None or step > last_triggered_step:
                        ckpts_to_be_restore.insert(0, ckpt)
                        global_steps_to_be_restore.insert(0, step)
                if len(ckpts_to_be_restore) > 0:
                    accumulated_waiting_time = 0
                _start_time = time.time()
                for step, (timestamp, ckpt) in zip(global_steps_to_be_restore,
                                                   ckpts_to_be_restore):
                    stat = saver.restore(ckpt)
                    if not stat:
                        logging.info(
                            f"Fail to restore checkpoint from {ckpt}. Skip...")
                        continue
                    logging.info(
                        f"Checkpoint with global_step={step} triggered on {timestamp}"
                    )
                    self._validator.validate(step)
                    last_triggered_step = step
                this_waiting_interval = max(
                    this_waiting_interval - int(time.time() - _start_time), 10)
                tf.summary.flush(file_writer)
            if ckpts_to_be_restore is None:
                pass
            elif len(ckpts_to_be_restore) > 1:
                this_waiting_interval = int(this_waiting_interval * 1. *
                                            (len(ckpts_to_be_restore) // 2) /
                                            len(ckpts_to_be_restore))
                next_waiting_interval = this_waiting_interval
            elif len(ckpts_to_be_restore) == 0:
                next_waiting_interval = min(
                    int(this_waiting_interval * 4. / 3.),
                    self._waiting_interval)
                this_waiting_interval = this_waiting_interval // 2
            accumulated_waiting_time += this_waiting_interval
            if accumulated_waiting_time > self._maximum_waiting_time:
                logging.info(
                    f"Waited for maximum patience: {self._maximum_waiting_time}s"
                )
                break
            time.sleep(this_waiting_interval)
            this_waiting_interval = next_waiting_interval
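
The interval bookkeeping at the bottom of the loop is easier to follow in isolation. Below is a standalone sketch of the same rules as a hypothetical helper (not part of the class above): a backlog of several new checkpoints roughly halves the polling interval, while an empty poll halves the current wait but lets the next one grow back toward the base interval:

def adapt_waiting_interval(this_interval, base_interval, num_new_ckpts):
    """Hypothetical helper mirroring the interval adjustment in run() above."""
    next_interval = this_interval
    if num_new_ckpts > 1:
        # Several checkpoints piled up: poll more often (roughly halve the interval).
        this_interval = int(this_interval * (num_new_ckpts // 2) / num_new_ckpts)
        next_interval = this_interval
    elif num_new_ckpts == 0:
        # Nothing new this round: shorten the current wait, but let the next
        # one grow back toward (and never beyond) the base interval.
        next_interval = min(int(this_interval * 4. / 3.), base_interval)
        this_interval = this_interval // 2
    return this_interval, next_interval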
Example 8
def _build_task_model(strategy, model_dir, batch_size):
    with training_utils.get_strategy_scope(strategy):
        model_configs = ModelConfigs.load(model_dir)
        task = build_task(model_configs, batch_size=batch_size)
        model = task.build_model(model_configs)
        return task, model
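
A minimal usage sketch for the helper above, assuming a standard tf.distribute strategy and a hypothetical model directory; the standalone quantization-inspection script that follows shows the same load-build-restore pattern outside any helper:

import tensorflow as tf

# Hypothetical usage: build the task and model under a MirroredStrategy scope.
strategy = tf.distribute.MirroredStrategy()
task, model = _build_task_model(strategy, "/path/to/model_dir", batch_size=32)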
import sys

import tensorflow as tf

from neurst.layers.quantization import QuantLayer
from neurst.models.transformer import Transformer
from neurst.tasks import build_task
from neurst.utils.checkpoints import restore_checkpoint_if_possible
from neurst.utils.configurable import ModelConfigs

model_dir = sys.argv[1]
model_configs = ModelConfigs.load(model_dir)
QuantLayer.global_init(model_configs["enable_quant"],
                       **model_configs["quant_params"])
task = build_task(model_configs)
model: Transformer = task.build_model(model_configs)
restore_checkpoint_if_possible(model, model_dir)

clip_max = model._encoder._stacking_layers[0][1]._layer._conv1.traced[
    "kernel"].clip_max

weight_clip_max = tf.maximum(clip_max, 0.0)
weight_clip_max = tf.cast(weight_clip_max, tf.float32)
bits_tmp = float(2**(QuantLayer.quant_bits - 1))
weight_clip_min = -weight_clip_max * bits_tmp / (bits_tmp - 1)

print("The quantized weight of encoder layer0's first ffn")