def _pre_load_args(args):
    """ Pre-loads arguments from config files and merges them with the
    predefined hyperparameter set and the saved model configs. """
    cfg_file_args = yaml_load_checking(
        load_from_config_path(
            flatten_string_list(
                getattr(args, flags_core.DEFAULT_CONFIG_FLAG.name))))
    model_dirs = flatten_string_list(args.model_dir
                                     or cfg_file_args.get("model_dir", None))
    hparams_set = args.hparams_set
    if hparams_set is None:
        hparams_set = cfg_file_args.get("hparams_set", None)
    predefined_parameters = get_hyper_parameters(hparams_set)
    formatted_parameters = {}
    if "model.class" in predefined_parameters:
        formatted_parameters["model.class"] = predefined_parameters.pop("model.class")
    if "model" in predefined_parameters:
        formatted_parameters["model"] = predefined_parameters.pop("model")
    if "model.params" in predefined_parameters:
        formatted_parameters["model.params"] = predefined_parameters.pop("model.params")
    if len(predefined_parameters) > 0:
        formatted_parameters["entry.params"] = predefined_parameters

    try:
        model_cfgs = ModelConfigs.load(model_dirs[0])
        return deep_merge_dict(
            deep_merge_dict(model_cfgs, formatted_parameters), cfg_file_args)
    except Exception:
        return deep_merge_dict(formatted_parameters, cfg_file_args)
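
# Hedged sketch: the demo below only illustrates the merge precedence used in
# `_pre_load_args` (model-dir config < predefined hparams < config-file args),
# assuming `deep_merge_dict(base, update)` lets `update` win on conflicting
# keys. The dictionaries and values are hypothetical.
def _demo_pre_load_merge_order():
    model_cfgs = {"entry.params": {"beam_size": 4}}
    formatted_parameters = {"entry.params": {"beam_size": 8}}
    cfg_file_args = {"entry.params": {"beam_size": 1}}
    merged = deep_merge_dict(
        deep_merge_dict(model_cfgs, formatted_parameters), cfg_file_args)
    # Under the assumed semantics, the config-file value (1) wins.
    print(merged["entry.params"]["beam_size"])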
def _build_and_restore_model(self):
    """ Builds a single model or an ensemble model and restores checkpoints. """
    model_dirs = flatten_string_list(self.model_dir)
    if len(model_dirs) == 1:
        model = self.model
        stat = restore_checkpoint_if_possible(model, model_dirs[0])
        if not stat:
            logging.info("WARNING: Failed to restore checkpoint from {}. "
                         "We assume this was done on purpose.".format(model_dirs[0]))
    else:
        logging.info("We assume models for ensemble are all based on the same task.")
        multiple_models = []
        for idx, one_model_dir in enumerate(model_dirs):
            name_prefix = "ensemble_{}".format(idx)
            logging.info("Creating model {} from {}".format(name_prefix, one_model_dir))
            cfg = ModelConfigs.load(one_model_dir)
            this_model = self.task.build_model(cfg, name=name_prefix)
            stat = restore_checkpoint_if_possible(this_model, one_model_dir)
            if not stat:
                logging.info("WARNING: Failed to restore checkpoint from {}. "
                             "We assume this was done on purpose.".format(one_model_dir))
            multiple_models.append(this_model)
        model = EncoderDecoderEnsembleModel.new(multiple_models)
    return model
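
# Hedged usage note: passing multiple directories (e.g. a comma-separated
# `model_dir`, assuming `flatten_string_list` splits it into a list) triggers
# the ensemble branch above; each directory is then expected to hold its own
# model config and checkpoint for the same task.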
def average_checkpoints(checkpoints, output_path):
    """ Averages the variables across checkpoints and saves the result to `output_path`. """
    assert checkpoints
    # Get the checkpoint list from flags and run some basic checks.
    checkpoints = flatten_string_list(checkpoints)
    checkpoints = [c for c in checkpoints if c]
    if not checkpoints:
        raise ValueError("No checkpoints provided for averaging.")
    # Pick up the model config YAML from the first checkpoint directory that has one.
    model_config_yml_path = None
    for c in checkpoints:
        if model_config_yml_path:
            break
        if tf.io.gfile.exists(
                os.path.join(c, ModelConfigs.MODEL_CONFIG_YAML_FILENAME)):
            model_config_yml_path = os.path.join(
                c, ModelConfigs.MODEL_CONFIG_YAML_FILENAME)
    all_checkpoint_paths = []
    for c in checkpoints:
        if tf.io.gfile.isdir(c):
            checkpoint_states = tf.train.get_checkpoint_state(c)
            all_checkpoint_paths.extend(
                checkpoint_list_checking(
                    checkpoint_states.all_model_checkpoint_paths))
        else:
            all_checkpoint_paths.append(c)
    # Average the variables incrementally so that only one extra copy of each
    # variable is held in memory at a time.
    var_values = {}
    var_cnts = {}
    var_name_shape_list = tf.train.list_variables(all_checkpoint_paths[0])
    for ckpt in all_checkpoint_paths:
        logging.info("loading from {}".format(ckpt))
        for var_name, _ in var_name_shape_list:
            if var_name.startswith("_") or var_name.startswith("save_counter"):
                logging.info("ignoring {}...".format(var_name))
                continue
            var = tf.train.load_variable(ckpt, var_name)
            fine_name = wrapper_var_name(var_name)
            if fine_name in var_values:
                # Running mean: mean_n = var / n + mean_{n-1} * (n - 1) / n.
                var_cnts[fine_name] += 1.
                var_values[fine_name] = (
                    var * 1. / var_cnts[fine_name]
                    + var_values[fine_name] * (var_cnts[fine_name] - 1.) / var_cnts[fine_name])
            else:
                var_cnts[fine_name] = 1.
                var_values[fine_name] = var
    tf_vars = dict()
    logging.info("Averaged variables: ")
    for var_name in var_values.keys():
        fine_name = wrapper_var_name(var_name)
        assert var_cnts[fine_name] == len(all_checkpoint_paths)
        logging.info(fine_name)
        tf_vars[fine_name] = tf.Variable(initial_value=var_values[fine_name],
                                         trainable=True,
                                         name=fine_name,
                                         dtype=str(var_values[fine_name].dtype))
    ckpt_saver = tf.train.Checkpoint(**tf_vars)
    ckpt_saver.save(os.path.join(output_path, "ckpt"))
    # Guard against the case where no checkpoint directory contains a config YAML.
    if model_config_yml_path:
        tf.io.gfile.copy(
            model_config_yml_path,
            os.path.join(output_path, ModelConfigs.MODEL_CONFIG_YAML_FILENAME),
            overwrite=True)
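
# Minimal self-contained sketch (hypothetical values) verifying that the
# incremental update in `average_checkpoints` is an exact running mean:
# mean_n = var / n + mean_{n-1} * (n - 1) / n.
def _demo_running_mean():
    import numpy as np
    values = [np.array([1., 2.]), np.array([3., 4.]), np.array([5., 6.])]
    mean, cnt = None, 0.
    for var in values:
        cnt += 1.
        mean = var if mean is None else var / cnt + mean * (cnt - 1.) / cnt
    np.testing.assert_allclose(mean, np.mean(values, axis=0))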
def _main(_):
    arg_parser = flags_core.define_flags(FLAG_LIST, with_config_file=False)
    args, remaining_argv = flags_core.intelligent_parse_flags(FLAG_LIST, arg_parser)
    flags_core.verbose_flags(FLAG_LIST, args, remaining_argv)
    average_checkpoints(checkpoints=flatten_string_list(args["checkpoints"]),
                        output_path=args["output_path"])
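
# Hedged invocation sketch: the entry-point module path and the file paths
# below are assumptions; only the `checkpoints` and `output_path` flags are
# taken from the code above.
#
#   python3 -m neurst.cli.average_checkpoints \
#       --checkpoints /path/to/ckpt_dir1,/path/to/ckpt_dir2 \
#       --output_path /path/to/averaged_ckpt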
def __init__(self, args, **kwargs):
    """ Initializes a util class for training neural models. """
    super(Trainer, self).__init__(**kwargs)
    self._tb_log_dir = args["tb_log_dir"]
    self._train_steps = args["train_steps"]
    self._summary_steps = args["summary_steps"]
    self._save_checkpoint_steps = args["save_checkpoint_steps"]
    self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
    self._initial_global_step = args["initial_global_step"]
    self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
    if args["pretrain_model"] and isinstance(args["pretrain_model"][0], dict):
        self._pretrain_v2 = True
        self._pretrain_model = args["pretrain_model"]
        if self._pretrain_variable_pattern:
            logging.info("Using pretrain model v2 and ignoring pretrain_variable_pattern: "
                         f"{self._pretrain_variable_pattern}")
    else:
        self._pretrain_v2 = False
        self._pretrain_model = flatten_string_list(args["pretrain_model"])
        if self._pretrain_model and self._pretrain_variable_pattern is None:
            self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
        assert ((self._pretrain_model is None
                 and self._pretrain_variable_pattern is None)
                or len(self._pretrain_model) == len(self._pretrain_variable_pattern)
                or len(self._pretrain_model) == 1), (
            "`pretrain_variable_pattern` must match with `pretrain_model`.")
        if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
            self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
    self._update_cycle = args["update_cycle"]
    self._clip_value = args["clip_value"]
    self._clip_norm = args["clip_norm"]
    self._hvd_backend = self.strategy if self.strategy in ["byteps", "horovod"] else None
    with training_utils.get_strategy_scope(self.strategy):
        self._criterion = build_criterion(args)
        self._criterion.set_model(self.model)
        self._lr_schedule_args = args
        if compat.IS_PREV_TF_2_4_0:
            self._optimizer = build_optimizer(args)
        else:
            self._optimizer = build_optimizer(args,
                                              clipnorm=self._clip_norm,
                                              clipvalue=self._clip_value)
        assert self._optimizer is not None, "optimizer parameters must be provided for training."
        self._validator = build_validator(args)
    self._experimental_count_batch_num = args["experimental_count_batch_num"]
    self._freeze_variables = args["freeze_variables"]
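
# Hedged configuration sketch: the keys mirror those read by the constructor
# above, while all values are hypothetical. One regex pattern per pretrained
# checkpoint restricts which variables get restored from it.
_EXAMPLE_PRETRAIN_ARGS = {
    "pretrain_model": ["/path/to/pretrained_ckpt"],
    "pretrain_variable_pattern": ["(encoder|input_symbols)"],
}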
def glob_tfrecords(file_path):
    """ Expands `file_path` into a list of TFRecord files: a directory expands
    to its "*train*" files, an existing file is kept as-is, and any other
    string is treated as a filename-prefix glob. """
    _files = flatten_string_list(file_path)
    _features_files = []
    for _file in _files:
        if tf.io.gfile.isdir(_file):
            _features_files.extend(tf.io.gfile.glob(os.path.join(_file, "*train*")))
        elif tf.io.gfile.exists(_file):
            _features_files.append(_file)
        else:
            _features_files.extend(tf.io.gfile.glob(_file + "*"))
    return _features_files
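
# Hedged usage sketch (the paths are hypothetical): mixing a directory, an
# exact file, and a prefix all resolves to one flat file list.
def _demo_glob_tfrecords():
    for f in glob_tfrecords(["/data/tfrecords_dir", "/data/train.tfrecord-00000"]):
        print(f)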
def take_one_record(data_path):
    """ Reads and returns the first `tf.train.Example` from the TFRecords
    under `data_path`. """
    _file_path = flatten_string_list(data_path)[0]
    if tf.io.gfile.isdir(_file_path):
        _feature_file = os.path.join(_file_path, "*train*")
    elif tf.io.gfile.exists(_file_path):
        _feature_file = _file_path
    else:
        _feature_file = _file_path + "*"
    dataset = tf.data.Dataset.list_files([_feature_file], shuffle=False)
    dataset = dataset.interleave(
        lambda f: tf.data.TFRecordDataset(f, buffer_size=128 * 1024 * 1024),
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    for x in dataset.take(1):
        example = tf.train.Example()
        example.ParseFromString(x.numpy())
        return example
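
# Hedged usage sketch (the path is hypothetical): peek at the feature keys of
# the first serialized example to sanity-check a TFRecord dataset.
def _demo_take_one_record():
    example = take_one_record("/data/tfrecords_dir")
    print(sorted(example.features.feature.keys()))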
def __init__(self, args, **kwargs):
    """ Initializes a util class for training neural models. """
    super(Trainer, self).__init__(**kwargs)
    self._tb_log_dir = args["tb_log_dir"]
    self._train_steps = args["train_steps"]
    self._summary_steps = args["summary_steps"]
    self._save_checkpoint_steps = args["save_checkpoint_steps"]
    self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
    self._initial_global_step = args["initial_global_step"]
    self._pretrain_model = flatten_string_list(args["pretrain_model"])
    self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
    if self._pretrain_model and self._pretrain_variable_pattern is None:
        self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
    assert ((self._pretrain_model is None
             and self._pretrain_variable_pattern is None)
            or len(self._pretrain_model) == len(self._pretrain_variable_pattern)
            or len(self._pretrain_model) == 1), (
        "`pretrain_variable_pattern` must match with `pretrain_model`.")
    if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
        self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
    self._update_cycle = args["update_cycle"]
    self._clip_value = args["clip_value"]
    self._clip_norm = args["clip_norm"]
    self._hvd_backend = self.strategy if self.strategy in ["byteps", "horovod"] else None
    with training_utils.get_strategy_scope(self.strategy):
        self._criterion = build_criterion(args)
        self._criterion.set_model(self.model)
        self._lr_schedule = build_lr_schedule(args)
        optimizer = build_optimizer(args)
        assert optimizer is not None, "optimizer parameters must be provided for training."
        self._optimizer = _handle_fp16_and_distributed_optimizer(
            optimizer, self._lr_schedule, self._hvd_backend)
        self._validator = build_validator(args)
    self._experimental_count_batch_num = args["experimental_count_batch_num"]
def load_tfrecords(file_path,
                   name_to_features,
                   shuffle=False,
                   deterministic=True,
                   feature_name_mapping=None,
                   map_func=None,
                   sharding_index=0,
                   num_shards=1,
                   auto_shard=False,
                   auxiliary_elements=None) -> tf.data.Dataset:
    """ Loads TFRecords and does auto-sharding according to the worker number.

    Args:
        file_path: The TFRecords file path.
        name_to_features: A `dict` mapping feature keys to `FixedLenFeature`
            or `VarLenFeature` values.
        shuffle: Whether to shuffle files.
        deterministic: Whether the outputs need to be produced in
            deterministic order.
        feature_name_mapping: A dict that maps the names in `name_to_features`
            to aliases.
        map_func: A callable function to process the data.
        sharding_index: The manually defined index for sharding.
        num_shards: The manually defined number of shards operating in parallel.
        auto_shard: Automatically shards the TFRecord parts if True.
        auxiliary_elements: A dict containing auxiliary elements that will be
            appended to each data sample.

    Returns:
        A dataset.
    """
    _features_files = []
    for _file in flatten_string_list(file_path):
        if tf.io.gfile.isdir(_file):
            _features_files.append(os.path.join(_file, "*train*"))
        elif tf.io.gfile.exists(_file):
            _features_files.append(_file)
        else:
            _features_files.append(_file + "*")
    shuffle = (shuffle is True) and (num_shards == 1)
    # Note that it is quite slow when passing a large list to `list_files`.
    dataset = tf.data.Dataset.list_files(_features_files, shuffle=shuffle)
    if num_shards > 1:
        logging.info("Shard %d of the whole dataset (%d workers in total).",
                     sharding_index, num_shards)
        dataset = dataset.shard(num_shards, sharding_index)
    else:  # auto sharding
        worker_id, num_workers, strategy = get_distributed_worker_setting()
        if (num_workers > 1 and strategy in ["horovod", "byteps"]
                and not shuffle and auto_shard):
            logging.info("Shard %d of the whole dataset (%d workers in total).",
                         worker_id, num_workers)
            options = tf.data.Options()
            options.experimental_distribute.auto_shard_policy = (
                tf.data.experimental.AutoShardPolicy.OFF)
            dataset = dataset.with_options(options)
            dataset = dataset.shard(num_workers, worker_id)
    logging.info("Loading TF Records from: ")
    for _f in dataset:
        logging.info(f"   {_f.numpy()}")
    # Read files and interleave results. When training, the order of the
    # examples will be non-deterministic.
    options = tf.data.Options()
    options.experimental_deterministic = deterministic
    dataset = dataset.interleave(
        lambda f: tf.data.TFRecordDataset(f, buffer_size=128 * 1024 * 1024),
        cycle_length=10,
        num_parallel_calls=tf.data.experimental.AUTOTUNE).with_options(options)
    if name_to_features is None:
        return dataset
    return dataset.map(
        lambda x: parse_tfexample(x, name_to_features, feature_name_mapping,
                                  map_func, auxiliary_elements=auxiliary_elements),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
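
# Hedged usage sketch: the path and feature schema below are hypothetical and
# must match whatever was actually written into the records.
def _demo_load_tfrecords():
    dataset = load_tfrecords(
        "/data/tfrecords_dir",
        name_to_features={"feature": tf.io.VarLenFeature(tf.int64),
                          "label": tf.io.VarLenFeature(tf.int64)})
    for sample in dataset.take(1):
        print(sample)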
def __init__(self, args, **kwargs):
    """ Initializes a util class for training neural models. """
    super(Trainer, self).__init__(**kwargs)
    self._tb_log_dir = args["tb_log_dir"]
    self._train_steps = args["train_steps"]
    self._summary_steps = args["summary_steps"]
    self._save_checkpoint_steps = args["save_checkpoint_steps"]
    self._checkpoints_max_to_keep = args["checkpoints_max_to_keep"]
    self._initial_global_step = args["initial_global_step"]
    self._pretrain_variable_pattern = args["pretrain_variable_pattern"]
    if args["pretrain_model"] and isinstance(args["pretrain_model"][0], dict):
        self._pretrain_v2 = True
        self._pretrain_model = args["pretrain_model"]
        if self._pretrain_variable_pattern:
            logging.info("Using pretrain model v2 and ignoring pretrain_variable_pattern: "
                         f"{self._pretrain_variable_pattern}")
    else:
        self._pretrain_v2 = False
        self._pretrain_model = flatten_string_list(args["pretrain_model"])
    if args["mask_dir"]:
        self.mask_dir = args["mask_dir"][0]
        # The mask file is expected to hold a pickled mask object.
        with open(self.mask_dir, 'rb') as f:
            self.load_mask = pickle.load(f)
    else:
        self.mask_dir = os.path.join(self.model_dir, "mask.pkl")
        self.load_mask = None
    if self._pretrain_model:
        if self._pretrain_variable_pattern is None:
            self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
        elif isinstance(self._pretrain_variable_pattern, str):
            self._pretrain_variable_pattern = [self._pretrain_variable_pattern]
    assert ((self._pretrain_model is None
             and self._pretrain_variable_pattern is None)
            or len(self._pretrain_model) == len(self._pretrain_variable_pattern)
            or len(self._pretrain_model) == 1), (
        "`pretrain_variable_pattern` must match with `pretrain_model`.")
    if self._pretrain_model is not None and self._pretrain_variable_pattern is None:
        self._pretrain_variable_pattern = [None] * len(self._pretrain_model)
    self._update_cycle = args["update_cycle"]
    self._clip_value = args["clip_value"]
    self._clip_norm = args["clip_norm"]
    self._hvd_backend = self.strategy if self.strategy in ["byteps", "horovod"] else None
    with training_utils.get_strategy_scope(self.strategy):
        self._criterion = build_criterion(args)
        self._criterion.set_model(self.model)
        self._lr_schedule_args = args
        if compat.IS_PREV_TF_2_4_0:
            self._optimizer = build_optimizer(args)
        else:
            self._optimizer = build_optimizer(args,
                                              clipnorm=self._clip_norm,
                                              clipvalue=self._clip_value)
        assert self._optimizer is not None, "optimizer parameters must be provided for training."
        self._validator = build_validator(args)
    self._experimental_count_batch_num = args["experimental_count_batch_num"]
    self._freeze_variables = args["freeze_variables"]
    self._pruning_schedule = build_pruning_schedule(args)
    self._partial_tuning = args["partial_tuning"]
    self._pruning_variable_pattern = args["pruning_variable_pattern"]
    self._nopruning_variable_pattern = args["nopruning_variable_pattern"]
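
# Hedged sketch: the constructor above unpickles `mask_dir`, so the file is
# expected to hold a picklable mask object (e.g. a list of per-variable numpy
# arrays). The helper name and path below are hypothetical.
def _demo_write_mask(mask_arrays, path="/path/to/mask.pkl"):
    import pickle
    with open(path, "wb") as f:
        pickle.dump(mask_arrays, f)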
def _args_preload_from_config_files(args):
    """ Pre-loads the arguments defined in config files, if any are provided. """
    cfg_file_args = yaml_load_checking(load_from_config_path(
        flatten_string_list(getattr(args, DEFAULT_CONFIG_FLAG.name, None))))
    return cfg_file_args
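
# Hedged sketch of a YAML config file consumed via the default config flag;
# the keys shown (`model_dir`, `entry.params`) appear in the loading code
# above, while the values are hypothetical.
#
#   # config.yml
#   model_dir: /path/to/model
#   entry.params:
#     beam_size: 4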