def create_and_batch_tfds(self, ds: Dataset, mode,
                          args=None, num_replicas_in_sync=1) -> tf.data.Dataset:
    """ Creates a dataset according to the `mode`.

    Args:
        args: A dict containing dataset arguments.
        ds: A neurst.data.datasets.Dataset object.
        mode: A ModeKeys indicating the running mode.
        num_replicas_in_sync: The number of GPUs or other workers. We will generate
            global batches, and each global batch is equally divisible by the number
            of replicas.

    Returns:
        A tf.data.Dataset.
    """
    if args is None:
        args = self._args
    else:
        args = deep_merge_dict(self._args, args)
    src_eos = tf.constant(self._src_data_pipeline.meta["eos_id"], dtype=tf.int64)
    trg_eos = tf.constant(self._trg_data_pipeline.meta["eos_id"], dtype=tf.int64)

    assert isinstance(ds, AbstractParallelDataset), (
        "The dataset for SeqToSeq task must inherit AbstractParallelDataset.")

    dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                       map_output_dtypes=self.inputs_signature(mode)[0],
                       auto_shard=(mode == compat.ModeKeys.TRAIN),
                       shuffle=(mode == compat.ModeKeys.TRAIN))

    if mode == compat.ModeKeys.INFER:
        logging.info("Creating test dataset.")
        test_dataset = dataset_utils.batch_sequential_dataset(
            dataset=dataset.cache(),
            batch_size=args["batch_size"],
            padding_values={"feature": src_eos},
            num_replicas_in_sync=num_replicas_in_sync,
            drop_remainder=False)
        return test_dataset
    elif mode == compat.ModeKeys.EVAL:
        logging.info("Creating evaluation dataset.")
        return dataset_utils.batch_sequential_dataset(
            dataset.cache(),
            batch_size=args["batch_size"],
            padding_values={"feature": src_eos, "label": trg_eos},
            num_replicas_in_sync=num_replicas_in_sync,
            drop_remainder=False)
    else:
        logging.info("Creating training dataset.")
        if args["cache_dataset"]:
            dataset = dataset.cache()
        dataset = dataset_utils.batch_sequential_dataset(
            dataset,
            padding_values={"feature": src_eos, "label": trg_eos},
            batch_size=args["batch_size"],
            batch_size_per_gpu=args["batch_size_per_gpu"],
            batch_by_tokens=args["batch_by_tokens"],
            shuffer_buffer=args["shuffle_buffer"],
            data_max_lengths={"feature": args["max_src_len"],
                              "label": args["max_trg_len"]},
            drop_remainder=True,
            num_replicas_in_sync=num_replicas_in_sync)
        return dataset
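# NOTE: `deep_merge_dict` above merges user-supplied `args` into the task
# defaults. A minimal sketch of the assumed semantics, for readers without the
# neurst source at hand (illustrative only; the real helper may differ, e.g.
# in how `local_overwrite` is handled):
import copy

def deep_merge_dict(base, overrides, local_overwrite=True):
    """Recursively merges `overrides` into `base` (assumed behaviour).

    With local_overwrite=True the merge mutates `base` in place; otherwise it
    operates on a deep copy and leaves `base` untouched.
    """
    merged = base if local_overwrite else copy.deepcopy(base)
    for key, value in (overrides or {}).items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = deep_merge_dict(merged[key], value, local_overwrite)
        else:
            merged[key] = value
    return merged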
def create_and_batch_tfds(self, ds: Dataset, mode,
                          args=None, num_replicas_in_sync=1) -> tf.data.Dataset:
    """ Creates a dataset according to the `mode`.

    Args:
        args: A dict containing dataset arguments.
        ds: A neurst.data.datasets.Dataset object.
        mode: A ModeKeys indicating the running mode.
        num_replicas_in_sync: The number of GPUs or other workers. We will generate
            global batches, and each global batch is equally divisible by the number
            of replicas.

    Returns:
        A tf.data.Dataset.
    """
    if args is None:
        args = self._args
    else:
        args = deep_merge_dict(self._args, args, local_overwrite=False)
    pad = tf.constant(self._data_pipeline.meta["pad_id"], dtype=tf.int64)

    dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                       map_output_dtypes=self.inputs_signature(mode)[0],
                       auto_shard=(mode == compat.ModeKeys.TRAIN),
                       shuffle=(mode == compat.ModeKeys.TRAIN))

    if mode == compat.ModeKeys.INFER:
        raise NotImplementedError
        # logging.info("Creating test dataset.")
        # return dataset.cache().padded_batch(
        #     dataset_utils.adjust_batch_size(args["batch_size"],
        #                                     num_replicas_in_sync=num_replicas_in_sync),
        #     padded_shapes={"tokens": [None]},
        #     padding_values={"tokens": pad},
        #     drop_remainder=False)
    elif mode == compat.ModeKeys.EVAL:
        logging.info("Creating evaluation dataset.")
        return dataset.cache().padded_batch(
            dataset_utils.adjust_batch_size(args["batch_size"],
                                            num_replicas_in_sync=num_replicas_in_sync),
            padded_shapes={"tokens": [None]},
            padding_values={"tokens": pad},
            drop_remainder=False)
    else:
        level = args.get("gpu_efficient_level", None)
        logging.info(f"Creating training dataset with GPU efficient level={level}.")
        dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                           map_output_dtypes=self.inputs_signature(mode)[0],
                           auto_shard=True, shuffle=True)
        dataset = dataset_utils.clean_dataset_by_length(
            dataset, {"tokens": args["max_len"]})
        if args["cache_dataset"]:
            dataset = dataset.cache()
        if args["shuffle_buffer"]:
            dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
        padding_values = {"tokens": pad}
        if args["max_len"] is None:
            raise RuntimeError("Must provide `max_len` for training.")
        max_len = minimal_multiple(args["max_len"], EFFICIENT_MULTIPLIER[level])
        batch_size = dataset_utils.adjust_batch_size(
            args["batch_size"], args["batch_size_per_gpu"],
            num_replicas_in_sync=num_replicas_in_sync, verbose=False)
        if level == GPU_EFFICIENT_LEVEL.LEVEL5:  # static batch
            _batch_size = batch_size
            if args["batch_by_tokens"]:
                _batch_size = _batch_size // max_len
            logging.info(f"Batching dataset with fixed shape: "
                         f"batch={_batch_size} x {max_len}.")
            return dataset.padded_batch(
                _batch_size // num_replicas_in_sync * num_replicas_in_sync,
                padded_shapes={"tokens": [max_len]},
                padding_values=padding_values,
                drop_remainder=True)
        else:
            bucket_boundaries = [
                EFFICIENT_MULTIPLIER[level] * i
                for i in range(1, max_len // EFFICIENT_MULTIPLIER[level] + 1)]
            if bucket_boundaries[-1] < max_len:
                bucket_boundaries.append(
                    minimal_multiple(bucket_boundaries[-1] + 1,
                                     EFFICIENT_MULTIPLIER[level]))
            bucket_boundaries = {"tokens": bucket_boundaries}
            bucket_batch_sizes = dataset_utils.adjust_batch_size(
                batch_size,
                bucket_boundaries=bucket_boundaries if args["batch_by_tokens"] else None,
                boundaries_reduce_to_length_fn=lambda x: max(tf.nest.flatten(x)),
                num_replicas_in_sync=num_replicas_in_sync)
            if level != GPU_EFFICIENT_LEVEL.LEVEL0:
                if isinstance(bucket_batch_sizes, list):
                    bucket_batch_sizes = [
                        int(maximum_lower_multiple(x // num_replicas_in_sync,
                                                   EFFICIENT_MULTIPLIER[level])
                            * num_replicas_in_sync)
                        for x in bucket_batch_sizes]
                else:
                    bucket_batch_sizes = int(
                        maximum_lower_multiple(bucket_batch_sizes // num_replicas_in_sync,
                                               EFFICIENT_MULTIPLIER[level])
                        * num_replicas_in_sync)
            return dataset_utils.batch_examples_by_token(
                dataset,
                bucket_boundaries=bucket_boundaries,
                bucket_batch_sizes=bucket_batch_sizes,
                padding_values=padding_values,
                example_length_func=lambda x: {k: tf.size(v) for k, v in x.items()})
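# The training branch above keeps tensor shapes GPU-friendly by rounding
# sequence lengths up and batch sizes down to multiples of
# EFFICIENT_MULTIPLIER[level]. Illustrative sketches of the two rounding
# helpers it relies on (assumed behaviour; the actual neurst implementations
# may add edge-case handling):

def minimal_multiple(x, multiplier):
    # Smallest multiple of `multiplier` that is >= x,
    # e.g. minimal_multiple(50, 8) == 56.
    return ((x + multiplier - 1) // multiplier) * multiplier

def maximum_lower_multiple(x, multiplier):
    # Largest multiple of `multiplier` that is <= x,
    # e.g. maximum_lower_multiple(50, 8) == 48.
    return (x // multiplier) * multiplier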
def create_and_batch_tfds(self, ds: Dataset, mode,
                          args=None, num_replicas_in_sync=1) -> tf.data.Dataset:
    """ Creates a dataset according to the `mode`.

    Args:
        args: A dict containing dataset arguments.
        ds: A neurst.data.datasets.Dataset object.
        mode: A ModeKeys indicating the running mode.
        num_replicas_in_sync: The number of GPUs or other workers. We will generate
            global batches, and each global batch is equally divisible by the number
            of replicas.

    Returns:
        A tf.data.Dataset.
    """
    if args is None:
        args = self._args
    else:
        args = deep_merge_dict(self._args, args, local_overwrite=False)
    src_eos = tf.constant(self._src_data_pipeline.meta["eos_id"], dtype=tf.int64)
    trg_eos = tf.constant(self._trg_data_pipeline.meta["eos_id"], dtype=tf.int64)

    assert isinstance(ds, AbstractParallelDataset), (
        "The dataset for SeqToSeq task must inherit AbstractParallelDataset.")

    dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                       map_output_dtypes=self.inputs_signature(mode)[0],
                       auto_shard=(mode == compat.ModeKeys.TRAIN),
                       shuffle=(mode == compat.ModeKeys.TRAIN))

    if mode == compat.ModeKeys.INFER:
        logging.info("Creating test dataset.")
        return dataset.cache().padded_batch(
            dataset_utils.adjust_batch_size(args["batch_size"],
                                            num_replicas_in_sync=num_replicas_in_sync),
            padded_shapes={"feature": [None]},
            padding_values={"feature": src_eos},
            drop_remainder=False)
    elif mode == compat.ModeKeys.EVAL:
        logging.info("Creating evaluation dataset.")
        return dataset.cache().padded_batch(
            dataset_utils.adjust_batch_size(args["batch_size"],
                                            num_replicas_in_sync=num_replicas_in_sync),
            padded_shapes={"feature": [None], "label": [None]},
            padding_values={"feature": src_eos, "label": trg_eos},
            drop_remainder=False)
    else:
        logging.info("Creating training dataset.")
        dataset = dataset_utils.clean_dataset_by_length(
            dataset, {"feature": args["max_src_len"],
                      "label": args["max_trg_len"]})
        if args["cache_dataset"]:
            dataset = dataset.cache()
        if args["shuffle_buffer"]:
            dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
        padding_values = {"feature": src_eos, "label": trg_eos}
        if args["max_src_len"] is None:
            raise RuntimeError("Must provide `max_src_len` for training.")
        if args["max_trg_len"] is None:
            raise RuntimeError("Must provide `max_trg_len` for training.")
        src_bucket_boundaries, trg_bucket_boundaries = dataset_utils.associated_bucket_boundaries(
            dataset_utils.create_batch_bucket_boundaries(args["max_src_len"]),
            dataset_utils.create_batch_bucket_boundaries(args["max_trg_len"]))
        bucket_boundaries = {"feature": src_bucket_boundaries,
                             "label": trg_bucket_boundaries}
        bucket_batch_sizes = dataset_utils.adjust_batch_size(
            args["batch_size"], args["batch_size_per_gpu"],
            bucket_boundaries=bucket_boundaries if args["batch_by_tokens"] else None,
            boundaries_reduce_to_length_fn=lambda x: max(tf.nest.flatten(x)),
            num_replicas_in_sync=num_replicas_in_sync)
        return dataset_utils.batch_examples_by_token(
            dataset,
            bucket_boundaries=bucket_boundaries,
            bucket_batch_sizes=bucket_batch_sizes,
            padding_values=padding_values,
            example_length_func=lambda x: {k: tf.size(v) for k, v in x.items()})
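# The bucketing performed by `dataset_utils.batch_examples_by_token` can be
# approximated with TensorFlow's built-in `bucket_by_sequence_length`. A toy,
# self-contained analogue (illustration only, not the library's code): each
# bucket's batch size is derived from a token budget, so shorter sequences get
# larger batches and every batch carries roughly the same number of tokens.
import tensorflow as tf

token_budget = 4096
boundaries = [16, 32, 64, 128]
# One batch size per bucket (len(boundaries) + 1 buckets in total).
batch_sizes = [token_budget // b for b in boundaries + [256]]

toy_ds = tf.data.Dataset.from_generator(
    lambda: ([1] * n for n in (5, 20, 40, 100, 200)),
    output_signature=tf.TensorSpec([None], tf.int64))
toy_ds = toy_ds.bucket_by_sequence_length(
    element_length_func=lambda x: tf.size(x),
    bucket_boundaries=boundaries,
    bucket_batch_sizes=batch_sizes)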
def main(processor_id, num_processors, num_output_shards,
         output_range_begin, output_range_end, output_template,
         dataset: Dataset, progressbar=False, task=None):
    assert 0 <= output_range_begin < output_range_end <= num_output_shards
    assert 0 <= processor_id < num_processors
    logging.info(f"Shards: {output_range_begin} to {output_range_end}")
    if not tf.io.gfile.exists(os.path.dirname(output_template)):
        tf.io.gfile.makedirs(os.path.dirname(output_template))
    file_paths = [output_template % (s, num_output_shards)
                  for s in range(output_range_begin, output_range_end)]
    tmp_file_paths = [f + ".incomplete" for f in file_paths]
    recordio_writers = [tf.io.TFRecordWriter(_x) for _x in tmp_file_paths]

    map_func = None
    if task is not None:
        map_func = task.get_data_preprocess_fn(ModeKeys.TRAIN, dataset.status)
    feature_type_dict = None
    i = 0
    if progressbar:
        from tqdm import tqdm
        iterator = tqdm(dataset.build_iterator(map_func=map_func,
                                               shard_id=processor_id,
                                               total_shards=num_processors)(),
                        total=dataset.num_samples // num_processors)
    else:
        iterator = dataset.build_iterator(map_func=map_func,
                                          shard_id=processor_id,
                                          total_shards=num_processors)()
    for example in iterator:  # lazily pre-process
        # Infer the feature type of each field from the first example.
        if feature_type_dict is None:
            feature_type_dict = dict()
            for name, data in example.items():
                data_type = type(numpy.array(data).flatten().tolist()[0])
                assert data_type in [int, float, str, bytes], (
                    "Not supported {}".format(data_type))
                feature_type_dict[name] = data_type
        feature_dict = {}
        for name, data in example.items():
            feature_dict[name] = _format_tf_feature(data, feature_type_dict[name])
        # Write each example to a randomly chosen shard.
        recordio_writers[random.randint(0, len(recordio_writers) - 1)].write(
            tf.train.Example(features=tf.train.Features(
                feature=feature_dict)).SerializeToString())
        i += 1
    logging.info(f"Total processed {i} samples.")
    for recordio_writer in recordio_writers:
        recordio_writer.close()
    # Publish the finished shards by renaming the ".incomplete" files.
    for tmp_f, f in zip(tmp_file_paths, file_paths):
        tf.io.gfile.rename(tmp_f, f, overwrite=True)

    logging.info("===================== Examine feature types =====================")
    for x in tf.data.TFRecordDataset(file_paths).take(1):
        example = tf.train.Example()
        example.ParseFromString(x.numpy())
        logging.info("{")
        for name in example.features.feature:
            if len(example.features.feature[name].bytes_list.value) > 0:
                logging.info(f"    \"{name}\": bytes (str)")
            elif len(example.features.feature[name].int64_list.value) > 0:
                logging.info(f"    \"{name}\": int64")
            elif len(example.features.feature[name].float_list.value) > 0:
                logging.info(f"    \"{name}\": float32")
        logging.info("}")
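# `_format_tf_feature` is referenced above but defined elsewhere. A
# hypothetical sketch consistent with the type checks in `main`
# (int / float / str / bytes), wrapping a value into the matching
# tf.train.Feature proto:
import numpy
import tensorflow as tf

def _format_tf_feature(data, feature_type):
    values = numpy.array(data).flatten().tolist()
    if feature_type is int:
        return tf.train.Feature(int64_list=tf.train.Int64List(value=values))
    if feature_type is float:
        return tf.train.Feature(float_list=tf.train.FloatList(value=values))
    # str and bytes are both stored as a bytes list.
    values = [v.encode("utf-8") if isinstance(v, str) else v for v in values]
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=values))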
def create_and_batch_tfds(self, ds: Dataset, mode,
                          args=None, num_replicas_in_sync=1) -> tf.data.Dataset:
    """ Creates a dataset according to the `mode`.

    Args:
        args: A dict containing dataset arguments.
        ds: A neurst.data.datasets.Dataset object.
        mode: A ModeKeys indicating the running mode.
        num_replicas_in_sync: The number of GPUs or other workers. We will generate
            global batches, and each global batch is equally divisible by the number
            of replicas.

    Returns:
        A tf.data.Dataset.
    """
    if args is None:
        args = self._args
    else:
        args = deep_merge_dict(self._args, args, local_overwrite=False)
    float_zero = tf.constant(0, dtype=tf.float32)
    int_zero = tf.constant(0, dtype=tf.int64)
    trg_eos = tf.constant(self._trg_data_pipeline.meta["eos_id"], dtype=tf.int64)

    dataset = ds.build(map_func=self.get_data_preprocess_fn(mode, ds.status, args),
                       map_output_dtypes=self.inputs_signature(mode)[0],
                       auto_shard=(mode == compat.ModeKeys.TRAIN),
                       shuffle=(mode == compat.ModeKeys.TRAIN))

    if mode == compat.ModeKeys.INFER:
        logging.info("Creating test dataset.")
        return dataset.cache().padded_batch(
            dataset_utils.adjust_batch_size(args["batch_size"],
                                            num_replicas_in_sync=num_replicas_in_sync),
            padded_shapes={"audio": [None], "audio_length": []},
            padding_values={"audio": float_zero, "audio_length": int_zero},
            drop_remainder=False)
    elif mode == compat.ModeKeys.EVAL:
        logging.info("Creating evaluation dataset.")
        return dataset.cache().padded_batch(
            dataset_utils.adjust_batch_size(args["batch_size"],
                                            num_replicas_in_sync=num_replicas_in_sync),
            padded_shapes={"audio": [None], "audio_length": [], "transcript": [None]},
            padding_values={"audio": float_zero, "audio_length": int_zero,
                            "transcript": trg_eos},
            drop_remainder=False)
    else:
        logging.info("Creating training dataset.")
        dataset = dataset_utils.clean_dataset_by_length(
            dataset, {"audio": args["max_src_len"] * self._audio_feature_dim
                               * self._audio_feature_channels,
                      "audio_length": -1,
                      "transcript": args["max_trg_len"]})
        if args["cache_dataset"]:
            dataset = dataset.cache()
        if args["shuffle_buffer"]:
            dataset = dataset.shuffle(buffer_size=args["shuffle_buffer"])
        padding_values = {"audio": float_zero,
                          "audio_length": int_zero,
                          "transcript": trg_eos}
        if args["max_src_len"] is None:
            raise RuntimeError("`max_src_len` for SpeechToText task must be provided.")
        if args["max_trg_len"] is None:
            raise RuntimeError("`max_trg_len` for SpeechToText task must be provided.")
        max_src_len = args["max_src_len"]
        max_trg_len = minimal_multiple(args["max_trg_len"], 8)
        audio_bucket_boundaries = create_audio_bucket_boundaries(
            max_src_len, args["min_src_bucket_boundary"])
        audio_bucket_boundaries[-1] = minimal_multiple(audio_bucket_boundaries[-1], 8)
        batch_size = dataset_utils.adjust_batch_size(
            args["batch_size"], args["batch_size_per_gpu"],
            num_replicas_in_sync=num_replicas_in_sync, verbose=False)
        batch_size_per_gpu = batch_size // num_replicas_in_sync
        assert batch_size_per_gpu > max_src_len, (
            f"batch size per gpu ({batch_size_per_gpu}) must be greater than "
            f"`max_src_len`={max_src_len}")
        if args["disable_batch_efficiency"]:
            bucket_batch_sizes = [int(batch_size_per_gpu // bound * num_replicas_in_sync)
                                  for bound in audio_bucket_boundaries]
        else:
            bucket_batch_sizes = [int(minimal_multiple(batch_size_per_gpu // bound, 8)
                                      * num_replicas_in_sync)
                                  for bound in audio_bucket_boundaries]
        frame_transcript_ratio = args["experimental_frame_transcript_ratio"]
        if frame_transcript_ratio is None:
            logging.info("WARNING: we recommend pre-scanning the dataset to estimate "
                         "the ratio: frame length / transcript length.")
        else:
            trans_bucket_boundaries = [
                int(bound / (frame_transcript_ratio
                             + i * (max_src_len / max_trg_len - frame_transcript_ratio)
                             / len(audio_bucket_boundaries)))
                for i, bound in enumerate(audio_bucket_boundaries)]
            trans_bucket_boundaries = [minimal_multiple(min(i, max_trg_len), 8)
                                       for i in trans_bucket_boundaries]
            num_buckets = len(trans_bucket_boundaries)
            true_trans_bucket_boundaries = []
            num_input_shapes = 0
            for idx, (batc, bound, tbound) in enumerate(
                    zip(bucket_batch_sizes, audio_bucket_boundaries,
                        trans_bucket_boundaries)):
                max_trans_len = [
                    tbound,
                    trans_bucket_boundaries[min(idx + 1, len(bucket_batch_sizes) - 1)]]
                num_input_shapes += len(set(max_trans_len))
                true_trans_bucket_boundaries.append(max_trans_len)
            logging.info(f"There are {num_input_shapes} input shapes to be compiled:")
            for idx, (batc, bound, tbound) in enumerate(
                    zip(bucket_batch_sizes, audio_bucket_boundaries,
                        true_trans_bucket_boundaries)):
                logging.info(f"  - batch={batc}, maximum-frames={bound}, "
                             f"maximum-transcript-length={set(tbound)}")
            true_trans_bucket_boundaries = tf.constant(true_trans_bucket_boundaries,
                                                       dtype=tf.int32)
            true_audio_bucket_boundaries = tf.transpose(
                tf.constant([audio_bucket_boundaries] * 2, dtype=tf.int32))

        bucket_batch_sizes = tf.constant(bucket_batch_sizes, dtype=tf.int64)
        audio_bucket_boundaries = tf.constant(audio_bucket_boundaries, dtype=tf.int32)

        def example_to_bucket_id(examples):
            """Return int64 bucket id for this example, calculated based on length."""
            if frame_transcript_ratio is None:
                conditions_c = tf.less_equal(tf.cast(examples["audio_length"], tf.int32),
                                             audio_bucket_boundaries)
                return tf.reduce_min(tf.where(conditions_c))
            conditions_c = tf.logical_and(
                tf.less_equal(tf.cast(examples["audio_length"], tf.int32),
                              true_audio_bucket_boundaries),
                tf.less_equal(tf.size(examples["transcript"]),
                              true_trans_bucket_boundaries))
            minimum_match = tf.where(conditions_c)[0]
            return minimum_match[0] * num_buckets + minimum_match[1]

        def window_size_fn(bucket_id):
            """Return number of examples to be grouped when given a bucket id."""
            if frame_transcript_ratio is None:
                return bucket_batch_sizes[bucket_id]
            return bucket_batch_sizes[bucket_id // num_buckets]

        def batching_fn(bucket_id, grouped_dataset):
            """Batch and add padding to a dataset of elements with similar lengths."""
            bucket_batch_size = window_size_fn(bucket_id)
            # Batch the dataset and add padding so that all input sequences in the
            # examples have the same length, and all target sequences have the same
            # length as well. Resulting lengths of inputs and targets can differ.
            return grouped_dataset.padded_batch(
                bucket_batch_size,
                padded_shapes={
                    "audio": ([(audio_bucket_boundaries[bucket_id]
                                if frame_transcript_ratio is None
                                else audio_bucket_boundaries[bucket_id // num_buckets])
                               * self._audio_feature_dim * self._audio_feature_channels]),
                    "audio_length": [],
                    "transcript": ([None] if frame_transcript_ratio is None
                                   else [true_trans_bucket_boundaries[
                                       bucket_id // num_buckets][bucket_id % num_buckets]])},
                padding_values=padding_values,
                drop_remainder=True)

        return dataset.apply(
            tf.data.experimental.group_by_window(
                key_func=example_to_bucket_id,
                reduce_func=batching_fn,
                window_size=None,
                window_size_func=window_size_fn))
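# The bucketing above hinges on `tf.data.experimental.group_by_window`: each
# example is routed to a bucket by `key_func`, and once a bucket collects
# `window_size_func(key)` examples they are reduced to one padded batch. A
# minimal, self-contained illustration of the pattern on toy data:
import tensorflow as tf

toy = tf.data.Dataset.range(10)
toy = toy.apply(
    tf.data.experimental.group_by_window(
        key_func=lambda x: x % 2,  # bucket id, analogous to example_to_bucket_id
        reduce_func=lambda key, window: window.batch(3),
        window_size=3))
for batch in toy:
    print(batch.numpy())  # [0 2 4], [1 3 5], then the leftover partial windows

# Note how the speech pipeline packs a 2-D bucket index into a single id as
# row * num_buckets + col, which window_size_fn and batching_fn unpack with
# `// num_buckets` and `% num_buckets`.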