Example 1
def _pre_load_args(args):
    cfg_file_args = yaml_load_checking(
        load_from_config_path(
            flatten_string_list(
                getattr(args, flags_core.DEFAULT_CONFIG_FLAG.name))))
    model_dirs = flatten_string_list(args.model_dir
                                     or cfg_file_args.get("model_dir", None))
    hparams_set = args.hparams_set
    if hparams_set is None:
        hparams_set = cfg_file_args.get("hparams_set", None)
    predefined_parameters = get_hyper_parameters(hparams_set)
    formatted_parameters = {}
    if "model.class" in predefined_parameters:
        formatted_parameters["model.class"] = predefined_parameters.pop(
            "model.class")
    if "model" in predefined_parameters:
        formatted_parameters["model"] = predefined_parameters.pop("model")
    if "model.params" in predefined_parameters:
        formatted_parameters["model.params"] = predefined_parameters.pop(
            "model.params")
    if len(predefined_parameters) > 0:
        formatted_parameters["entry.params"] = predefined_parameters

    try:
        model_cfgs = ModelConfigs.load(model_dirs[0])
        return deep_merge_dict(
            deep_merge_dict(model_cfgs, formatted_parameters), cfg_file_args)
    except Exception:
        return deep_merge_dict(formatted_parameters, cfg_file_args)
Example 2
 def _build_and_restore_model(self):
     """ Build a single model or ensemble model. """
     model_dirs = flatten_string_list(self.model_dir)
     if len(model_dirs) == 1:
         model = self.model
         stat = restore_checkpoint_if_possible(model, model_dirs[0])
         if not stat:
             logging.info("WARNING: Fail to restore checkpoint from {}. "
                          "We assume this was done on purpose. ".format(
                              model_dirs[0]))
     else:
         logging.info(
             "We assume models for ensemble are all based on the same task."
         )
         multiple_models = []
         for idx, one_model_dir in enumerate(model_dirs):
             name_prefix = "ensemble_{}".format(idx)
             logging.info("Create model for {} from {}".format(
                 name_prefix, one_model_dir))
             cfg = ModelConfigs.load(one_model_dir)
             this_model = self.task.build_model(cfg, name=name_prefix)
             stat = restore_checkpoint_if_possible(this_model,
                                                   one_model_dir)
             if not stat:
                 logging.info(
                     "WARNING: Fail to restore checkpoint from {}. "
                     "We assume this was done on purpose. ".format(
                         one_model_dir))
             multiple_models.append(this_model)
         model = EncoderDecoderEnsembleModel.new(multiple_models)
     return model
Example 3
def average_checkpoints(checkpoints, output_path):
    assert checkpoints
    # Get the checkpoints list from flags and run some basic checks.
    checkpoints = flatten_string_list(checkpoints)
    checkpoints = [c for c in checkpoints if c]
    if not checkpoints:
        raise ValueError("No checkpoints provided for averaging.")
    model_config_yml_path = None
    for c in checkpoints:
        if model_config_yml_path:
            break
        if tf.io.gfile.exists(
                os.path.join(c, ModelConfigs.MODEL_CONFIG_YAML_FILENAME)):
            model_config_yml_path = os.path.join(
                c, ModelConfigs.MODEL_CONFIG_YAML_FILENAME)
    all_checkpoint_paths = []
    for c in checkpoints:
        if tf.io.gfile.isdir(c):
            checkpoint_states = tf.train.get_checkpoint_state(c)
            all_checkpoint_paths.extend(
                checkpoint_list_checking(
                    checkpoint_states.all_model_checkpoint_paths))
        else:
            all_checkpoint_paths.append(c)
    var_values = {}
    var_cnts = {}
    var_name_shape_list = tf.train.list_variables(all_checkpoint_paths[0])
    for ckpt in all_checkpoint_paths:
        logging.info("loading from {}".format(ckpt))
        for var_name, _ in var_name_shape_list:
            if var_name.startswith("_") or var_name.startswith("save_counter"):
                logging.info("ignore {}...".format(var_name))
                continue
            var = tf.train.load_variable(ckpt, var_name)
            fine_name = wrapper_var_name(var_name)
            if fine_name in var_values:
                # Running mean over checkpoints:
                #   mean_n = x / n + mean_{n-1} * (n - 1) / n
                var_cnts[fine_name] += 1.
                var_values[fine_name] = var * 1. / var_cnts[
                    fine_name] + var_values[fine_name] * (
                        var_cnts[fine_name] - 1.) / var_cnts[fine_name]
            else:
                var_cnts[fine_name] = 1.
                var_values[fine_name] = var
    tf_vars = dict()
    logging.info("Averaged variables: ")
    # Keys of `var_values` are already the wrapped variable names.
    for fine_name in var_values.keys():
        assert var_cnts[fine_name] == len(all_checkpoint_paths)
        logging.info(fine_name)
        tf_vars[fine_name] = tf.Variable(initial_value=var_values[fine_name],
                                         trainable=True,
                                         name=fine_name,
                                         dtype=str(
                                             var_values[fine_name].dtype))
    ckpt_saver = tf.train.Checkpoint(**tf_vars)
    ckpt_saver.save(os.path.join(output_path, "ckpt"))
    # Copy the model config file to the output path, if one was found.
    if model_config_yml_path:
        tf.io.gfile.copy(model_config_yml_path,
                         os.path.join(output_path,
                                      ModelConfigs.MODEL_CONFIG_YAML_FILENAME),
                         overwrite=True)
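A minimal call sketch (the paths below are placeholders): each entry in `checkpoints` may be a checkpoint directory or an explicit checkpoint prefix, as the directory check above allows, and the averaged weights are saved under `output_path`.

# Hypothetical paths, for illustration only.
average_checkpoints(
    checkpoints=["/models/run1", "/models/run2/ckpt-20000"],
    output_path="/models/averaged")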
Example 4
def _main(_):
    arg_parser = flags_core.define_flags(FLAG_LIST, with_config_file=False)
    args, remaining_argv = flags_core.intelligent_parse_flags(
        FLAG_LIST, arg_parser)
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    average_checkpoints(checkpoints=flatten_string_list(args["checkpoints"]),
                        output_path=args["output_path"])
Example 5
 def __init__(self, args, **kwargs):
     """ Initializes a util class for training neural models. """
     super(Trainer, self).__init__(**kwargs)
     self._tb_log_dir = args["tb_log_dir"]
     self._train_steps = args["train_steps"]
     self._summary_steps = args["summary_steps"]
     self._save_checkpoint_steps = args["save_checkpoint_steps"]
     self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
     self._initial_global_step = args["initial_global_step"]
     self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
     if args["pretrain_model"] and isinstance(args["pretrain_model"][0],
                                              dict):
         self._pretrain_v2 = True
         self._pretrain_model = args["pretrain_model"]
         if self._pretrain_variable_pattern:
             logging.info(
                 "Using pretrain model v2 and ignoring pretrain_variable_pattern: "
                 f"{self._pretrain_variable_pattern}")
     else:
         self._pretrain_v2 = False
         self._pretrain_model = flatten_string_list(args["pretrain_model"])
         if self._pretrain_model and self._pretrain_variable_pattern is None:
             self._pretrain_variable_pattern = [None] * len(
                 self._pretrain_model)
         assert (
             (self._pretrain_model is None
              and self._pretrain_variable_pattern is None)
             or len(self._pretrain_model) == len(
                 self._pretrain_variable_pattern)
             or len(self._pretrain_model) == 1
         ), ("`pretrain_variable_pattern` must match with `pretrain_model`."
             )
         if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
             self._pretrain_variable_pattern = [None] * len(
                 self._pretrain_model)
     self._update_cycle = args["update_cycle"]
     self._clip_value = args["clip_value"]
     self._clip_norm = args["clip_norm"]
     self._hvd_backend = self.strategy if self.strategy in [
         "byteps", "horovod"
     ] else None
     with training_utils.get_strategy_scope(self.strategy):
         self._criterion = build_criterion(args)
         self._criterion.set_model(self.model)
         self._lr_schedule_args = args
         if compat.IS_PREV_TF_2_4_0:
             self._optimizer = build_optimizer(args)
         else:
             self._optimizer = build_optimizer(args,
                                               clipnorm=self._clip_norm,
                                               clipvalue=self._clip_value)
         assert self._optimizer is not None, "optimizer parameters must be provided for training."
     self._validator = build_validator(args)
     self._experimental_count_batch_num = args[
         "experimental_count_batch_num"]
     self._freeze_variables = args["freeze_variables"]
Example 6
def glob_tfrecords(file_path):
    _files = flatten_string_list(file_path)
    _features_files = []
    for _file in _files:
        if tf.io.gfile.isdir(_file):
            _features_files.extend(tf.io.gfile.glob(os.path.join(_file, "*train*")))
        elif tf.io.gfile.exists(_file):
            _features_files.append(_file)
        else:
            _features_files.extend(tf.io.gfile.glob(_file + "*"))
    return _features_files
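A brief usage sketch with a hypothetical path; the helper returns TFRecord files collected from a directory, an exact file, or a filename prefix.

# "/data/tfrecords" is a placeholder path.
files = glob_tfrecords("/data/tfrecords")
print("Found {} TFRecord files".format(len(files)))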
Example 7
def take_one_record(data_path):
    _file_path = flatten_string_list(data_path)[0]
    if tf.io.gfile.isdir(_file_path):
        _feature_file = os.path.join(_file_path, "*train*")
    elif tf.io.gfile.exists(_file_path):
        _feature_file = _file_path
    else:
        _feature_file = _file_path + "*"
    dataset = tf.data.Dataset.list_files([_feature_file], shuffle=False)
    dataset = dataset.interleave(
        lambda f: tf.data.TFRecordDataset(f, buffer_size=128 * 1024 * 1024),
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    for x in dataset.take(1):
        example = tf.train.Example()
        example.ParseFromString(x.numpy())
        return example
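A small usage sketch (the path is a placeholder), assuming the records hold serialized `tf.train.Example` protos; the returned proto can be inspected to list its feature keys and value types.

example = take_one_record("/data/tfrecords")  # hypothetical path
if example is not None:
    for key, feature in example.features.feature.items():
        # `WhichOneof` reports whether the value is a bytes_list, float_list or int64_list.
        print(key, feature.WhichOneof("kind"))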
Example 8
 def __init__(self, args, **kwargs):
     """ Initializes a util class for training neural models. """
     super(Trainer, self).__init__(**kwargs)
     self._tb_log_dir = args["tb_log_dir"]
     self._train_steps = args["train_steps"]
     self._summary_steps = args["summary_steps"]
     self._save_checkpoint_steps = args["save_checkpoint_steps"]
     self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
     self._initial_global_step = args["initial_global_step"]
     self._pretrain_model = flatten_string_list(args["pretrain_model"])
     self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
     if self._pretrain_model and self._pretrain_variable_pattern is None:
         self._pretrain_variable_pattern = [None] * len(
             self._pretrain_model)
     assert (
         (self._pretrain_model is None
          and self._pretrain_variable_pattern is None) or len(
              self._pretrain_model) == len(self._pretrain_variable_pattern)
         or len(self._pretrain_model) == 1
     ), ("`pretrain_variable_pattern` must match with `pretrain_model`.")
     if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
         self._pretrain_variable_pattern = [None] * len(
             self._pretrain_model)
     self._update_cycle = args["update_cycle"]
     self._clip_value = args["clip_value"]
     self._clip_norm = args["clip_norm"]
     self._hvd_backend = self.strategy if self.strategy in [
         "byteps", "horovod"
     ] else None
     with training_utils.get_strategy_scope(self.strategy):
         self._criterion = build_criterion(args)
         self._criterion.set_model(self.model)
         self._lr_schedule = build_lr_schedule(args)
         optimizer = build_optimizer(args)
         assert optimizer is not None, "optimizer parameters must be provided for training."
         self._optimizer = _handle_fp16_and_distributed_optimizer(
             optimizer, self._lr_schedule, self._hvd_backend)
     self._validator = build_validator(args)
     self._experimental_count_batch_num = args[
         "experimental_count_batch_num"]
Example 9
def load_tfrecords(file_path,
                   name_to_features,
                   shuffle=False,
                   deterministic=True,
                   feature_name_mapping=None,
                   map_func=None,
                   sharding_index=0,
                   num_shards=1,
                   auto_shard=False,
                   auxiliary_elements=None) -> tf.data.Dataset:
    """ Loads TFRecords and does autot-sharding according to worker num.

    Args:
        file_path: The TFRecords file path.
        name_to_features: A `dict` mapping feature keys to `FixedLenFeature` or
            `VarLenFeature` values.
        shuffle: Whether to shuffle files.
        deterministic: Whether the outputs need to be produced in deterministic order.
        feature_name_mapping: A dict that maps the names in `name_to_features` to aliases.
        map_func: A callable function to process the data.
        sharding_index: The manually defined index for sharding.
        num_shards: The manually defined number of shards operating in parallel.
        auto_shard: Automatically shard the TFRecord parts if True.
        auxiliary_elements: A dict containing auxiliary elements that will be
            appended to each data sample.

    Returns: A dataset.
    """
    _features_files = []
    for _file in flatten_string_list(file_path):
        if tf.io.gfile.isdir(_file):
            _features_files.append(os.path.join(_file, "*train*"))
        elif tf.io.gfile.exists(_file):
            _features_files.append(_file)
        else:
            _features_files.append(_file + "*")
    shuffle = (shuffle is True) and (num_shards == 1)
    # Note that it is quite slow when passing a large list to `list_files`
    dataset = tf.data.Dataset.list_files(_features_files, shuffle=shuffle)
    if num_shards > 1:
        logging.info("Shard %d of the whole dataset(total %d workers).",
                     sharding_index, num_shards)
        dataset = dataset.shard(num_shards, sharding_index)
    else:
        # auto sharding
        worker_id, num_workers, strategy = get_distributed_worker_setting()
        if num_workers > 1 and strategy in ["horovod", "byteps"
                                            ] and not shuffle and auto_shard:
            logging.info("Shard %d of the whole dataset(total %d workers).",
                         worker_id, num_workers)
            options = tf.data.Options()
            options.experimental_distribute.auto_shard_policy = tf.data.experimental.AutoShardPolicy.OFF
            dataset = dataset.with_options(options)
            dataset = dataset.shard(num_workers, worker_id)
    logging.info("Loading TF Records from: ")
    for _f in dataset:
        logging.info(f"   {_f.numpy()}")
    # Read files and interleave results.
    # When training, the order of the examples will be non-deterministic.
    options = tf.data.Options()
    options.experimental_deterministic = deterministic
    dataset = dataset.interleave(
        lambda f: tf.data.TFRecordDataset(f, buffer_size=128 * 1024 * 1024),
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE).with_options(options)

    if name_to_features is None:
        return dataset
    return dataset.map(
        lambda x: parse_tfexample(x,
                                  name_to_features,
                                  feature_name_mapping,
                                  map_func,
                                  auxiliary_elements=auxiliary_elements),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
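A minimal usage sketch under assumed feature names ("audio" and "tokens" are hypothetical); `name_to_features` must mirror how the records were serialized, and `auto_shard=True` lets each Horovod/BytePS worker read its own shard, as described in the docstring.

import tensorflow as tf

name_to_features = {
    "audio": tf.io.VarLenFeature(tf.float32),   # hypothetical feature key
    "tokens": tf.io.VarLenFeature(tf.int64),    # hypothetical feature key
}
dataset = load_tfrecords("/data/tfrecords",     # dir, file, or filename prefix
                         name_to_features,
                         shuffle=True,
                         auto_shard=True)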
Example 10
 def __init__(self, args, **kwargs):
     """ Initializes a util class for training neural models. """
     super(Trainer, self).__init__(**kwargs)
     self._tb_log_dir = args["tb_log_dir"]
     self._train_steps = args["train_steps"]
     self._summary_steps = args["summary_steps"]
     self._save_checkpoint_steps = args["save_checkpoint_steps"]
     self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
     self._initial_global_step = args["initial_global_step"]
     self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
     if args["pretrain_model"] and isinstance(args["pretrain_model"][0],
                                              dict):
         self._pretrain_v2 = True
         self._pretrain_model = args["pretrain_model"]
         if self._pretrain_variable_pattern:
             logging.info(
                 "Using pretrain model v2 and ignoring pretrain_variable_pattern: "
                 f"{self._pretrain_variable_pattern}")
     else:
         self._pretrain_v2 = False
         self._pretrain_model = flatten_string_list(args["pretrain_model"])
         if args["mask_dir"]:
             self.mask_dir = args["mask_dir"][0]
             # Load the pruning mask from the given pickle file.
             with open(self.mask_dir, 'rb') as f:
                 self.load_mask = pickle.load(f)
         else:
             self.mask_dir = os.path.join(self.model_dir, "mask.pkl")
             self.load_mask = None
         if self._pretrain_model:
             if self._pretrain_variable_pattern is None:
                 self._pretrain_variable_pattern = [None] * len(
                     self._pretrain_model)
             elif isinstance(self._pretrain_variable_pattern, str):
                 self._pretrain_variable_pattern = [
                     self._pretrain_variable_pattern
                 ]
         assert (
             (self._pretrain_model is None
              and self._pretrain_variable_pattern is None)
             or len(self._pretrain_model) == len(
                 self._pretrain_variable_pattern)
             or len(self._pretrain_model) == 1
         ), ("`pretrain_variable_pattern` must match with `pretrain_model`."
             )
         if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
             self._pretrain_variable_pattern = [None] * len(
                 self._pretrain_model)
     self._update_cycle = args["update_cycle"]
     self._clip_value = args["clip_value"]
     self._clip_norm = args["clip_norm"]
     self._hvd_backend = self.strategy if self.strategy in [
         "byteps", "horovod"
     ] else None
     with training_utils.get_strategy_scope(self.strategy):
         self._criterion = build_criterion(args)
         self._criterion.set_model(self.model)
         self._lr_schedule_args = args
         if compat.IS_PREV_TF_2_4_0:
             self._optimizer = build_optimizer(args)
         else:
             self._optimizer = build_optimizer(args,
                                               clipnorm=self._clip_norm,
                                               clipvalue=self._clip_value)
         assert self._optimizer is not None, "optimizer parameters must be provided for training."
     self._validator = build_validator(args)
     self._experimental_count_batch_num = args[
         "experimental_count_batch_num"]
     self._freeze_variables = args["freeze_variables"]
     self._pruning_schedule = build_pruning_schedule(args)
     self._partial_tuning = args["partial_tuning"]
     self._pruning_variable_pattern = args["pruning_variable_pattern"]
     self._nopruning_variable_pattern = args["nopruning_variable_pattern"]
Example 11
def _args_preload_from_config_files(args):
    cfg_file_args = yaml_load_checking(load_from_config_path(
        flatten_string_list(getattr(args, DEFAULT_CONFIG_FLAG.name, None))))
    return cfg_file_args