def generate_vocab(input, output, min_frequency, max_vocab_size,
                   lowercase=False, extra_slots=0):
    with tf.io.gfile.GFile(input, "r") as finput:
        # Counter for all tokens in the vocabulary.
        cnt = collections.Counter()
        for line in finput:
            if lowercase:
                line = line.lower()
            tokens = line.strip().split()
            tokens = [_ for _ in tokens if len(_) > 0]
            cnt.update(tokens)
    # Pad the final size (tokens + extra slots + 3 special symbols) to a
    # multiple of 8 for GPU efficiency.
    extra_slots = minimal_multiple(len(cnt) + extra_slots + 3, 8) - len(cnt) - 3
    extra_slots_list = []
    if extra_slots > 0:
        idx = 0
        while len(extra_slots_list) < extra_slots:
            _txt = ("EXTRA_SLOT%.3d" % idx)
            idx += 1
            if _txt in cnt:
                continue
            extra_slots_list.append(_txt)
    logging.info("Found %d unique tokens in the vocabulary.", len(cnt))
    # Filter tokens below the frequency threshold.
    if min_frequency > 0:
        filtered_tokens = [(w, c) for w, c in cnt.most_common()
                           if c >= min_frequency]
        cnt = collections.Counter(dict(filtered_tokens))
        logging.info("Found %d unique tokens with frequency >= %d.",
                     len(cnt), min_frequency)
    # Sort tokens by frequency, breaking ties by reversed lexical order.
    word_with_counts = cnt.most_common()
    word_with_counts = sorted(word_with_counts,
                              key=lambda x: (x[1], x[0]), reverse=True)
    # Truncate to the maximum vocabulary size.
    if max_vocab_size is not None and max_vocab_size > 0:
        word_with_counts = word_with_counts[:max_vocab_size]
    with tf.io.gfile.GFile(output, "w") as foutput:
        # Write the extra slots first.
        for x in extra_slots_list:
            foutput.write("{}\t{}\n".format(x, 1000))
        logging.info("Prepended %d extra slots to the vocabulary.",
                     len(extra_slots_list))
        for word, count in word_with_counts:
            foutput.write("{}\t{}\n".format(word, count))
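
# `minimal_multiple` and `maximum_lower_multiple` are used throughout this
# section but defined elsewhere in the repository. A minimal sketch of what
# they are assumed to compute (the zero guard in the second helper is this
# sketch's assumption, not necessarily the library's exact fallback):
def minimal_multiple(n, multiple):
    """The smallest multiple of `multiple` that is >= n, e.g. (50, 8) -> 56."""
    return (n + multiple - 1) // multiple * multiple

def maximum_lower_multiple(n, multiple):
    """The largest multiple of `multiple` that is <= n, e.g. (50, 8) -> 48."""
    m = n // multiple * multiple
    return m if m > 0 else n  # avoid collapsing small batches to zero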
def create_and_batch_tfds(self, ds: Dataset, mode,
                          args=None, num_replicas_in_sync=1) -> tf.data.Dataset:
    """ Creates a dataset according to the `mode`.

    Args:
        args: A dict containing dataset arguments.
        ds: A neurst.data.datasets.Dataset object.
        mode: A ModeKeys indicating the running mode.
        num_replicas_in_sync: The number of GPUs or other workers. We will
            generate global batches, and each global batch is equally
            divisible by the number of replicas.

    Returns: A tf.data.Dataset.
    """
    if args is None:
        args = self._args
    else:
        args = deep_merge_dict(self._args, args, local_overwrite=False)
    pad = tf.constant(self._data_pipeline.meta["pad_id"], dtype=tf.int64)
    dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                       map_output_dtypes=self.inputs_signature(mode)[0],
                       auto_shard=(mode == compat.ModeKeys.TRAIN),
                       shuffle=(mode == compat.ModeKeys.TRAIN))
    if mode == compat.ModeKeys.INFER:
        raise NotImplementedError
        # logging.info("Creating test dataset.")
        # return dataset.cache().padded_batch(
        #     dataset_utils.adjust_batch_size(args["batch_size"],
        #                                     num_replicas_in_sync=num_replicas_in_sync),
        #     padded_shapes={"tokens": [None]},
        #     padding_values={"tokens": pad},
        #     drop_remainder=False)
    elif mode == compat.ModeKeys.EVAL:
        logging.info("Creating evaluation dataset.")
        return dataset.cache().padded_batch(
            dataset_utils.adjust_batch_size(args["batch_size"],
                                            num_replicas_in_sync=num_replicas_in_sync),
            padded_shapes={"tokens": [None]},
            padding_values={"tokens": pad},
            drop_remainder=False)
    else:
        level = args.get("gpu_efficient_level", None)
        logging.info(f"Creating training dataset with GPU efficient level={level}.")
        dataset = dataset_utils.clean_dataset_by_length(
            dataset, {"tokens": args["max_len"]})
        if args["cache_dataset"]:
            dataset = dataset.cache()
        if args["shuffle_buffer"]:
            dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
        padding_values = {"tokens": pad}
        if args["max_len"] is None:
            raise RuntimeError("Must provide `max_len` for training.")
        max_len = minimal_multiple(args["max_len"], EFFICIENT_MULTIPLIER[level])
        batch_size = dataset_utils.adjust_batch_size(
            args["batch_size"], args["batch_size_per_gpu"],
            num_replicas_in_sync=num_replicas_in_sync, verbose=False)
        if level == GPU_EFFICIENT_LEVEL.LEVEL5:  # static batch
            _batch_size = batch_size
            if args["batch_by_tokens"]:
                _batch_size = _batch_size // max_len
            logging.info("Batching dataset with fixed shape: "
                         f"batch={_batch_size} x {max_len}.")
            return dataset.padded_batch(
                _batch_size // num_replicas_in_sync * num_replicas_in_sync,
                padded_shapes={"tokens": [max_len]},
                padding_values=padding_values,
                drop_remainder=True)
        else:
            bucket_boundaries = [
                EFFICIENT_MULTIPLIER[level] * i
                for i in range(1, max_len // EFFICIENT_MULTIPLIER[level] + 1)]
            if bucket_boundaries[-1] < max_len:
                bucket_boundaries.append(
                    minimal_multiple(bucket_boundaries[-1] + 1,
                                     EFFICIENT_MULTIPLIER[level]))
            bucket_boundaries = {"tokens": bucket_boundaries}
            bucket_batch_sizes = dataset_utils.adjust_batch_size(
                batch_size,
                bucket_boundaries=bucket_boundaries if args["batch_by_tokens"] else None,
                boundaries_reduce_to_length_fn=lambda x: max(tf.nest.flatten(x)),
                num_replicas_in_sync=num_replicas_in_sync)
            if level != GPU_EFFICIENT_LEVEL.LEVEL0:
                if isinstance(bucket_batch_sizes, list):
                    bucket_batch_sizes = [
                        int(maximum_lower_multiple(x // num_replicas_in_sync,
                                                   EFFICIENT_MULTIPLIER[level])
                            * num_replicas_in_sync)
                        for x in bucket_batch_sizes]
                else:
                    bucket_batch_sizes = int(
                        maximum_lower_multiple(bucket_batch_sizes // num_replicas_in_sync,
                                               EFFICIENT_MULTIPLIER[level])
                        * num_replicas_in_sync)
            return dataset_utils.batch_examples_by_token(
                dataset,
                bucket_boundaries=bucket_boundaries,
                bucket_batch_sizes=bucket_batch_sizes,
                padding_values=padding_values,
                example_length_func=lambda x: {k: tf.size(v) for k, v in x.items()})
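
# The functions in this section depend on module-level constants defined
# elsewhere in the repository. A minimal sketch of their assumed shape (the
# concrete multiplier values are illustrative, not the library's actual ones):
import enum

class GPU_EFFICIENT_LEVEL(enum.IntEnum):
    """Higher levels trade more padding waste for fewer compiled input shapes."""
    LEVEL0 = 0  # dynamic bucketing, no alignment constraints
    LEVEL1 = 1
    LEVEL2 = 2
    LEVEL3 = 3
    LEVEL4 = 4
    LEVEL5 = 5  # a single static input shape

# Assumed here: every level aligns lengths and batch sizes to multiples of 8.
EFFICIENT_MULTIPLIER = {level: 8 for level in GPU_EFFICIENT_LEVEL}

# A worked example of the bucketing math above, under the assumptions
# multiplier=8, args["max_len"]=50, 4096 tokens per global batch, 2 replicas:
#   max_len           -> minimal_multiple(50, 8) == 56
#   bucket_boundaries -> [8, 16, 24, 32, 40, 48, 56]
#   the 56-token bucket then holds roughly
#   maximum_lower_multiple((4096 // 56) // 2, 8) * 2 == 32 * 2 == 64 examples,
# so each global batch splits evenly across the two replicas.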
def create_and_batch_tfds(self, ds, mode, args=None, num_replicas_in_sync=1):
    """ With efficient level for training. """
    if mode in [compat.ModeKeys.INFER, compat.ModeKeys.EVAL]:
        return super(Translation, self).create_and_batch_tfds(
            ds, mode, args, num_replicas_in_sync)
    if args is None:
        args = self._args
    else:
        args = deep_merge_dict(self._args, args, local_overwrite=False)
    level = args.get("gpu_efficient_level", None)
    auto_scale_batch = args.get("auto_scaling_batch_size", None)
    logging.info(f"Creating training dataset with GPU efficient level={level}.")
    dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                       map_output_dtypes=self.inputs_signature(mode)[0],
                       auto_shard=True, shuffle=True)
    dataset = dataset_utils.clean_dataset_by_length(
        dataset, {"feature": args["max_src_len"], "label": args["max_trg_len"]})
    if args["cache_dataset"]:
        dataset = dataset.cache()
    if args["shuffle_buffer"]:
        dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
    padding_values = {
        "feature": tf.constant(self._src_data_pipeline.meta["eos_id"], dtype=tf.int64),
        "label": tf.constant(self._trg_data_pipeline.meta["eos_id"], dtype=tf.int64)}
    if args["max_src_len"] is None:
        raise RuntimeError("Must provide `max_src_len` for training.")
    if args["max_trg_len"] is None:
        raise RuntimeError("Must provide `max_trg_len` for training.")
    max_src_len = minimal_multiple(args["max_src_len"], EFFICIENT_MULTIPLIER[level])
    max_trg_len = minimal_multiple(args["max_trg_len"], EFFICIENT_MULTIPLIER[level])
    max_len = max(max_src_len, max_trg_len)
    batch_size = dataset_utils.adjust_batch_size(
        args["batch_size"], args["batch_size_per_gpu"],
        num_replicas_in_sync=num_replicas_in_sync, verbose=False)
    if auto_scale_batch:
        batch_size = _auto_scale_batch_size(batch_size, level)
        logging.info(f"Auto scaling batch size to {batch_size}.")
    if level == GPU_EFFICIENT_LEVEL.LEVEL5:  # static batch
        _batch_size = batch_size
        if args["batch_by_tokens"]:
            _batch_size = _batch_size // max_len
        logging.info("Batching dataset with fixed shape: "
                     f"batch={_batch_size} x (feature={max_src_len}, label={max_trg_len}).")
        return dataset.padded_batch(
            _batch_size // num_replicas_in_sync * num_replicas_in_sync,
            padded_shapes={"feature": [max_src_len], "label": [max_trg_len]},
            drop_remainder=True, padding_values=padding_values)
    else:
        src_bucket_boundaries = [
            EFFICIENT_MULTIPLIER[level] * i
            for i in range(1, max_src_len // EFFICIENT_MULTIPLIER[level] + 1)]
        if src_bucket_boundaries[-1] < max_src_len:
            src_bucket_boundaries.append(
                minimal_multiple(src_bucket_boundaries[-1] + 1,
                                 EFFICIENT_MULTIPLIER[level]))
        trg_bucket_boundaries = [
            EFFICIENT_MULTIPLIER[level] * i
            for i in range(1, max_trg_len // EFFICIENT_MULTIPLIER[level] + 1)]
        if trg_bucket_boundaries[-1] < max_trg_len:
            trg_bucket_boundaries.append(
                minimal_multiple(trg_bucket_boundaries[-1] + 1,
                                 EFFICIENT_MULTIPLIER[level]))
        src_bucket_boundaries, trg_bucket_boundaries = dataset_utils.associated_bucket_boundaries(
            src_bucket_boundaries, trg_bucket_boundaries)
        bucket_boundaries = {"feature": src_bucket_boundaries,
                             "label": trg_bucket_boundaries}
        bucket_batch_sizes = dataset_utils.adjust_batch_size(
            batch_size,
            bucket_boundaries=bucket_boundaries if args["batch_by_tokens"] else None,
            boundaries_reduce_to_length_fn=lambda x: max(tf.nest.flatten(x)),
            num_replicas_in_sync=num_replicas_in_sync)
        if level != GPU_EFFICIENT_LEVEL.LEVEL0:
            if isinstance(bucket_batch_sizes, list):
                bucket_batch_sizes = [
                    int(maximum_lower_multiple(x // num_replicas_in_sync,
                                               EFFICIENT_MULTIPLIER[level])
                        * num_replicas_in_sync)
                    for x in bucket_batch_sizes]
            else:
                bucket_batch_sizes = int(
                    maximum_lower_multiple(bucket_batch_sizes // num_replicas_in_sync,
                                           EFFICIENT_MULTIPLIER[level])
                    * num_replicas_in_sync)
        return dataset_utils.batch_examples_by_token(
            dataset,
            bucket_boundaries=bucket_boundaries,
            bucket_batch_sizes=bucket_batch_sizes,
            padding_values=padding_values,
            example_length_func=lambda x: {k: tf.size(v) for k, v in x.items()})
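
# `_auto_scale_batch_size` above is defined elsewhere in the repository. The
# assumed intent: higher efficiency levels waste more tokens on padding, so
# the nominal batch size is enlarged to compensate. A hypothetical sketch
# (the per-level factors here are illustrative assumptions only):
def _auto_scale_batch_size(batch_size, level):
    scale = {GPU_EFFICIENT_LEVEL.LEVEL5: 2.0}.get(level, 1.2)  # assumed factors
    return int(batch_size * scale)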
def create_and_batch_tfds(self, ds: Dataset, mode,
                          args=None, num_replicas_in_sync=1) -> tf.data.Dataset:
    """ Creates a dataset according to the `mode`.

    Args:
        args: A dict containing dataset arguments.
        ds: A neurst.data.datasets.Dataset object.
        mode: A ModeKeys indicating the running mode.
        num_replicas_in_sync: The number of GPUs or other workers. We will
            generate global batches, and each global batch is equally
            divisible by the number of replicas.

    Returns: A tf.data.Dataset.
    """
    if args is None:
        args = self._args
    else:
        args = deep_merge_dict(self._args, args, local_overwrite=False)
    float_zero = tf.constant(0, dtype=tf.float32)
    int_zero = tf.constant(0, dtype=tf.int64)
    trg_eos = tf.constant(self._trg_data_pipeline.meta["eos_id"], dtype=tf.int64)
    dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                       map_output_dtypes=self.inputs_signature(mode)[0],
                       auto_shard=(mode == compat.ModeKeys.TRAIN),
                       shuffle=(mode == compat.ModeKeys.TRAIN))
    if mode == compat.ModeKeys.INFER:
        logging.info("Creating test dataset.")
        return dataset.cache().padded_batch(
            dataset_utils.adjust_batch_size(args["batch_size"],
                                            num_replicas_in_sync=num_replicas_in_sync),
            padded_shapes={"audio": [None], "audio_length": []},
            padding_values={"audio": float_zero, "audio_length": int_zero},
            drop_remainder=False)
    elif mode == compat.ModeKeys.EVAL:
        logging.info("Creating evaluation dataset.")
        return dataset.cache().padded_batch(
            dataset_utils.adjust_batch_size(args["batch_size"],
                                            num_replicas_in_sync=num_replicas_in_sync),
            padded_shapes={"audio": [None], "audio_length": [], "transcript": [None]},
            padding_values={"audio": float_zero, "audio_length": int_zero,
                            "transcript": trg_eos},
            drop_remainder=False)
    else:
        logging.info("Creating training dataset.")
        dataset = dataset_utils.clean_dataset_by_length(
            dataset,
            {"audio": args["max_src_len"] * self._audio_feature_dim
                      * self._audio_feature_channels,
             "audio_length": -1,
             "transcript": args["max_trg_len"]})
        if args["cache_dataset"]:
            dataset = dataset.cache()
        if args["shuffle_buffer"]:
            dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
        padding_values = {"audio": float_zero, "audio_length": int_zero,
                          "transcript": trg_eos}
        if args["max_src_len"] is None:
            raise RuntimeError("`max_src_len` for SpeechToText task must be provided.")
        if args["max_trg_len"] is None:
            raise RuntimeError("`max_trg_len` for SpeechToText task must be provided.")
        max_src_len = args["max_src_len"]
        max_trg_len = minimal_multiple(args["max_trg_len"], 8)
        audio_bucket_boundaries = create_audio_bucket_boundaries(
            max_src_len, args["min_src_bucket_boundary"])
        audio_bucket_boundaries[-1] = minimal_multiple(audio_bucket_boundaries[-1], 8)
        batch_size = dataset_utils.adjust_batch_size(
            args["batch_size"], args["batch_size_per_gpu"],
            num_replicas_in_sync=num_replicas_in_sync, verbose=False)
        batch_size_per_gpu = batch_size // num_replicas_in_sync
        assert batch_size_per_gpu > max_src_len, (
            f"batch size per gpu ({batch_size_per_gpu}) must be greater than "
            f"`max_src_len`={max_src_len}")
        if args["disable_batch_efficiency"]:
            bucket_batch_sizes = [int(batch_size_per_gpu // bound * num_replicas_in_sync)
                                  for bound in audio_bucket_boundaries]
        else:
            bucket_batch_sizes = [int(minimal_multiple(batch_size_per_gpu // bound, 8)
                                      * num_replicas_in_sync)
                                  for bound in audio_bucket_boundaries]
        frame_transcript_ratio = args["experimental_frame_transcript_ratio"]
        if frame_transcript_ratio is None:
            logging.info("WARNING: we recommend pre-scanning the dataset to estimate "
                         "the ratio: frame length / transcript length.")
        else:
            trans_bucket_boundaries = [
                int(bound / (frame_transcript_ratio
                             + i * (max_src_len / max_trg_len - frame_transcript_ratio)
                             / len(audio_bucket_boundaries)))
                for i, bound in enumerate(audio_bucket_boundaries)]
            trans_bucket_boundaries = [minimal_multiple(min(i, max_trg_len), 8)
                                       for i in trans_bucket_boundaries]
            num_buckets = len(trans_bucket_boundaries)
            true_trans_bucket_boundaries = []
            num_input_shapes = 0
            for idx, (batc, bound, tbound) in enumerate(zip(
                    bucket_batch_sizes, audio_bucket_boundaries, trans_bucket_boundaries)):
                max_trans_len = [
                    tbound,
                    trans_bucket_boundaries[min(idx + 1, len(bucket_batch_sizes) - 1)]]
                num_input_shapes += len(set(max_trans_len))
                true_trans_bucket_boundaries.append(max_trans_len)
            logging.info(f"There are {num_input_shapes} input shapes to be compiled:")
            for idx, (batc, bound, tbound) in enumerate(zip(
                    bucket_batch_sizes, audio_bucket_boundaries,
                    true_trans_bucket_boundaries)):
                logging.info(f" - batch={batc}, maximum-frames={bound}, "
                             f"maximum-transcript-length={set(tbound)}")
            true_trans_bucket_boundaries = tf.constant(true_trans_bucket_boundaries,
                                                       dtype=tf.int32)
            true_audio_bucket_boundaries = tf.transpose(
                tf.constant([audio_bucket_boundaries] * 2, dtype=tf.int32))

        bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
        audio_bucket_boundaries = tf.constant(audio_bucket_boundaries, dtype=tf.int32)

        def example_to_bucket_id(examples):
            """Returns the int64 bucket id for this example, based on its length."""
            if frame_transcript_ratio is None:
                conditions_c = tf.less_equal(
                    tf.cast(examples["audio_length"], tf.int32),
                    audio_bucket_boundaries)
                return tf.reduce_min(tf.where(conditions_c))
            conditions_c = tf.logical_and(
                tf.less_equal(tf.cast(examples["audio_length"], tf.int32),
                              true_audio_bucket_boundaries),
                tf.less_equal(tf.size(examples["transcript"]),
                              true_trans_bucket_boundaries))
            minimum_match = tf.where(conditions_c)[0]
            return minimum_match[0] * num_buckets + minimum_match[1]

        def window_size_fn(bucket_id):
            """Returns the number of examples to be grouped for a given bucket id."""
            if frame_transcript_ratio is None:
                return bucket_batch_sizes[bucket_id]
            return bucket_batch_sizes[bucket_id // num_buckets]

        def batching_fn(bucket_id, grouped_dataset):
            """Batches and pads a dataset of elements with similar lengths."""
            bucket_batch_size = window_size_fn(bucket_id)
            # Pad so that all audio sequences in a batch share one length and
            # all transcripts share another; the two lengths can differ.
            return grouped_dataset.padded_batch(
                bucket_batch_size,
                padded_shapes={
                    "audio": [(audio_bucket_boundaries[bucket_id]
                               if frame_transcript_ratio is None
                               else audio_bucket_boundaries[bucket_id // num_buckets])
                              * self._audio_feature_dim * self._audio_feature_channels],
                    "audio_length": [],
                    "transcript": ([None] if frame_transcript_ratio is None
                                   else [true_trans_bucket_boundaries[
                                       bucket_id // num_buckets][bucket_id % num_buckets]])},
                padding_values=padding_values,
                drop_remainder=True)

        return dataset.apply(tf.data.experimental.group_by_window(
            key_func=example_to_bucket_id,
            reduce_func=batching_fn,
            window_size=None,
            window_size_func=window_size_fn))
def create_and_batch_tfds(self, ds, mode, args=None, num_replicas_in_sync=1):
    """ With efficient level for training. """
    if args is None:
        args = self._args
    else:
        args = deep_merge_dict(self._args, args)
    level = args.get("gpu_efficient_level", None)
    auto_scale_batch = args.get("auto_scaling_batch_size", None)
    if (mode in [compat.ModeKeys.INFER, compat.ModeKeys.EVAL]
            or level is None or level == GPU_EFFICIENT_LEVEL.LEVEL0):
        return super(Translation, self).create_and_batch_tfds(
            ds, mode, args, num_replicas_in_sync)
    padding_values = {
        "feature": tf.constant(self._src_data_pipeline.meta["eos_id"], dtype=tf.int64),
        "label": tf.constant(self._trg_data_pipeline.meta["eos_id"], dtype=tf.int64)}
    dataset = ds.build(auto_shard=True,
                       map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                       map_output_dtypes=self.inputs_signature(mode)[0])
    max_src_len = args["max_src_len"]
    max_trg_len = args["max_trg_len"]
    batch_by_tokens = args["batch_by_tokens"]
    assert max_src_len, "Must provide `max_src_len` when `gpu_efficient_level` > 0"
    assert max_trg_len, "Must provide `max_trg_len` when `gpu_efficient_level` > 0"
    logging.info(f"Creating training dataset with `gpu_efficient_level`={level}.")
    dataset = clean_dataset_by_length(dataset, {"feature": max_src_len,
                                                "label": max_trg_len})
    if args["cache_dataset"]:
        dataset = dataset.cache()
    if args["shuffle_buffer"]:
        dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
    batch_size_per_gpu = args["batch_size_per_gpu"]
    batch_size = args["batch_size"]
    if batch_size_per_gpu is None:
        batch_size_per_gpu = batch_size // num_replicas_in_sync
    if batch_by_tokens:
        assert batch_size_per_gpu > max(max_src_len, max_trg_len), (
            f"batch size per gpu ({batch_size_per_gpu}) must be greater than "
            f"both `max_src_len` ({max_src_len}) and `max_trg_len` ({max_trg_len})")
    if auto_scale_batch:
        new_batch_size_per_gpu = _auto_scale_batch_size(batch_size_per_gpu, level)
        logging.info(f"Auto scaling `batch_size_per_gpu` from {batch_size_per_gpu} "
                     f"to {new_batch_size_per_gpu}")
        batch_size_per_gpu = new_batch_size_per_gpu
    max_src_len = minimal_multiple(max_src_len, EFFICIENT_MULTIPLIER[level])
    max_trg_len = minimal_multiple(max_trg_len, EFFICIENT_MULTIPLIER[level])
    max_len = max(max_src_len, max_trg_len)
    if level == GPU_EFFICIENT_LEVEL.LEVEL5:  # static batch
        if batch_by_tokens:
            batch_size_per_gpu = batch_size_per_gpu // max_len
        return dataset.padded_batch(
            int(minimal_multiple(batch_size_per_gpu, EFFICIENT_MULTIPLIER[level])
                * num_replicas_in_sync),
            padded_shapes={"feature": [max_src_len], "label": [max_trg_len]},
            drop_remainder=True,
            padding_values=padding_values)
    else:
        bucket_boundaries = [
            EFFICIENT_MULTIPLIER[level] * i
            for i in range(1, max_len // EFFICIENT_MULTIPLIER[level] + 1)]
        if bucket_boundaries[-1] < max_len:
            bucket_boundaries.append(
                minimal_multiple(bucket_boundaries[-1] + 1,
                                 EFFICIENT_MULTIPLIER[level]))
        buckets_min = [0] + bucket_boundaries[:-1]
        if batch_by_tokens:
            bucket_batch_sizes = [
                int(minimal_multiple(batch_size_per_gpu // bound,
                                     EFFICIENT_MULTIPLIER[level])
                    * num_replicas_in_sync)
                for bound in bucket_boundaries]
        else:
            bucket_batch_sizes = [
                int(minimal_multiple(batch_size_per_gpu,
                                     EFFICIENT_MULTIPLIER[level])
                    * num_replicas_in_sync)] * len(bucket_boundaries)
        logging.info(f"There are {len(bucket_batch_sizes)} input shapes to be compiled:")
        for batc, bound in zip(bucket_batch_sizes, bucket_boundaries):
            logging.info(f" - batch={batc}, maximum-length={bound}")
        bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
        bucket_boundaries = tf.constant(bucket_boundaries, dtype=tf.int32)

        def example_to_bucket_id(examples):
            """Returns the int64 bucket id for this example, based on its length."""
            seq_length = tf.cast(tf.maximum(tf.size(examples["feature"]),
                                            tf.size(examples["label"])), tf.int32)
            conditions_c = tf.logical_and(
                tf.less(buckets_min, seq_length),
                tf.less_equal(seq_length, bucket_boundaries))
            bucket_id = tf.reduce_min(tf.where(conditions_c))
            return bucket_id

        def window_size_fn(bucket_id):
            """Returns the number of examples to be grouped for a given bucket id."""
            return bucket_batch_sizes[bucket_id]

        def batching_fn(bucket_id, grouped_dataset):
            """Batches and pads a dataset of elements with similar lengths."""
            bucket_batch_size = window_size_fn(bucket_id)
            # Pad so that all source sequences in a batch share one length and
            # all target sequences share another; here both use the same
            # bucket boundary.
            return grouped_dataset.padded_batch(
                bucket_batch_size,
                padded_shapes={"feature": [bucket_boundaries[bucket_id]],
                               "label": [bucket_boundaries[bucket_id]]},
                padding_values=padding_values,
                drop_remainder=True)

        return dataset.apply(tf.data.experimental.group_by_window(
            key_func=example_to_bucket_id,
            reduce_func=batching_fn,
            window_size=None,
            window_size_func=window_size_fn))
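
# A minimal usage sketch of the batching entry point above. The task
# construction is hypothetical; the real Translation task is configured
# elsewhere in the repository:
#
#   task = Translation(args)  # hypothetical construction
#   train_ds = task.create_and_batch_tfds(
#       raw_dataset, compat.ModeKeys.TRAIN, num_replicas_in_sync=8)
#   for batch in train_ds.take(1):
#       print(batch["feature"].shape, batch["label"].shape)
#
# Every yielded global batch has static, multiplier-aligned shapes and is
# evenly divisible across the 8 replicas.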