def read_input_glob_and_sample_rate_from_flags(input_glob_flag,
                                                sample_rate_flag,
                                                tfds_dataset_flag,
                                                output_filename_flag,
                                                tfds_data_dir_flag):
  """Read flags for input data and sample rate.

  Args:
    input_glob_flag: String flag. The input file glob.
    sample_rate_flag: String flag. The sample rate.
    tfds_dataset_flag: String flag. The TFDS dataset.
    output_filename_flag: String flag. The output filename.
    tfds_data_dir_flag: String flag. Optional location of local TFDS data.

  Returns:
    (input_filenames, output_filenames, sample_rate)
    `input_filenames` is a list of lists of filenames. `output_filenames` is a
    list of the same length.
  """
  if input_glob_flag:
    assert file_utils.Glob(input_glob_flag), input_glob_flag
    assert not tfds_data_dir_flag
    input_filenames = [file_utils.Glob(input_glob_flag)]
    output_filenames = [output_filename_flag]
    sample_rate = sample_rate_flag
  else:
    assert tfds_dataset_flag
    dataset_name = tfds_dataset_flag
    # Download dataset, if necessary.
    tfds.load(dataset_name, data_dir=tfds_data_dir_flag)
    sample_rate = _tfds_sample_rate(dataset_name, tfds_data_dir_flag)
    assert sample_rate, sample_rate
    input_filenames = []
    output_filenames = []
    for split_name in ('train', 'validation', 'test'):
      input_filenames.append(
          _tfds_filenames(dataset_name, split_name, tfds_data_dir_flag))
      output_filenames.append(output_filename_flag + f'.{split_name}')
    logging.info('TFDS input filenames: %s', input_filenames)
    logging.info('sample rate: %s', sample_rate)

  if sample_rate:
    assert isinstance(sample_rate, numbers.Number)

  for filename_list in input_filenames:
    for filename in filename_list:
      assert tf.io.gfile.exists(filename), filename
  assert len(input_filenames) == len(output_filenames)

  return input_filenames, output_filenames, sample_rate
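# Usage sketch (not part of the original module; relies on the module's
# existing imports). The argument values below are illustrative: either an
# input glob or a TFDS dataset name is supplied, not both.
def _example_resolve_inputs():
  # Glob-based input: returns a single shard list, a single output filename,
  # and the sample rate passed through the flag.
  return read_input_glob_and_sample_rate_from_flags(
      input_glob_flag='/tmp/audio-*.tfrecord',
      sample_rate_flag=16000,
      tfds_dataset_flag=None,
      output_filename_flag='/tmp/embeddings',
      tfds_data_dir_flag=None)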
def main(unused_argv):
  assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
  assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
  assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

  # Create output directory if it doesn't already exist.
  outdir = os.path.dirname(FLAGS.output_file)
  file_utils.MaybeMakeDirs(outdir)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[FLAGS.embedding_list, model_names]):

    def _params_dict(l2_normalization,
                     speaker_id_name=FLAGS.speaker_id_name,
                     elem=elem):
      return {
          'embedding_name': elem[0],
          'model_name': elem[1],
          'label_name': FLAGS.label_name,
          'label_list': FLAGS.label_list,
          'train_glob': FLAGS.train_glob,
          'eval_glob': FLAGS.eval_glob,
          'test_glob': FLAGS.test_glob,
          'l2_normalization': l2_normalization,
          'speaker_id_name': speaker_id_name,
          'save_model_dir': FLAGS.save_model_dir,
          'eval_metric': FLAGS.eval_metric,
      }

    exp_params.append(_params_dict(l2_normalization=True))
    exp_params.append(_params_dict(l2_normalization=False))
    if FLAGS.speaker_id_name is not None:
      exp_params.append(
          _params_dict(l2_normalization=True, speaker_id_name=None))
      exp_params.append(
          _params_dict(l2_normalization=False, speaker_id_name=None))

  # Make and run beam pipeline.
  beam_options = None

  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    _ = (root
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'Reshuffle' >> beam.Reshuffle()
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                num_shards=1))
def validate_inputs(input_filenames_list, output_filenames, embedding_modules,
                    embedding_names, module_output_keys):
  """Validate inputs and input flags."""
  for filename_list in input_filenames_list:
    for filename in filename_list:
      assert tf.io.gfile.exists(filename), filename
  assert len(input_filenames_list) == len(output_filenames)

  # Make sure output files don't already exist.
  for output_filename in output_filenames:
    assert not file_utils.Glob(f'{output_filename}*'), output_filename

  # Lengths of flag lists must be the same.
  assert len(embedding_names) == len(embedding_modules), (
      embedding_names, embedding_modules)
  assert len(embedding_modules) == len(module_output_keys), (
      embedding_modules, module_output_keys)
  # Shortnames must be unique.
  assert len(set(embedding_names)) == len(embedding_names), embedding_names

  # Create output directory if it doesn't already exist.
  for output_filename in output_filenames:
    output_dir = output_filename.rsplit('/', 1)[0]
    file_utils.MaybeMakeDirs(output_dir)
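# Usage sketch (not part of the original module; relies on the module's
# existing imports). The filenames, module URL, and output key below are
# illustrative assumptions about how the flag lists line up.
def _example_validate_inputs():
  validate_inputs(
      input_filenames_list=[['/tmp/audio-00000-of-00001.tfrecord']],
      output_filenames=['/tmp/embeddings.train'],
      embedding_modules=[
          'https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/3'],
      embedding_names=['trill'],
      module_output_keys=['embedding'])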
def get_data(file_pattern,
             teacher_fn,
             output_dimension,
             reader,
             samples_key,
             min_length,
             batch_size,
             loop_forever,
             shuffle,
             shuffle_buffer_size=10000):
  """Gets the data for TRILL distillation.

  This function is *always* stochastic.

  Args:
    file_pattern: Glob for input data.
    teacher_fn: A function that takes 1 argument and returns label embeddings.
    output_dimension: Feature dimension of teacher output.
    reader: Class used to parse data on disk.
    samples_key: Name of audio samples in tf.Examples.
    min_length: The minimum audio length. Should take sample rate into account.
      Examples smaller than this are dropped. Examples longer than this are
      randomly cropped to this size.
    batch_size: Batch size of data in returned tf.data.Dataset.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    shuffle_buffer_size: Size of shuffle buffer.

  Returns:
    A tf.data.Dataset of (audio samples, regression targets).
  """
  assert file_utils.Glob(file_pattern), file_pattern
  assert callable(teacher_fn)

  # Audio samples are variable length.
  features = {
      samples_key: tf.io.VarLenFeature(dtype=tf.float32),
  }

  # Load data into a dataset of batch size 1. Then preprocess.
  ds = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=1,
      num_epochs=None if loop_forever else 1,
      reader_num_threads=tf.data.experimental.AUTOTUNE,
      parser_num_threads=tf.data.experimental.AUTOTUNE,
      features=features,
      reader=reader,
      shuffle=shuffle,
      shuffle_buffer_size=shuffle_buffer_size,
      prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
      sloppy_ordering=True)

  ds = tf_data_pipeline(ds, teacher_fn, samples_key, min_length, batch_size,
                        output_dimension)

  return ds
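# Usage sketch (not part of the original module; relies on the module's
# existing `tensorflow as tf` import). The glob, key names, and dimensions are
# illustrative; `tf.data.TFRecordDataset` is a common choice for the `reader`
# when the data is TFRecord-backed tf.Examples.
def _example_distillation_dataset():
  # Dummy teacher: any callable mapping a batch of cropped audio samples to
  # [batch, output_dimension] embeddings works here.
  dummy_teacher = lambda samples: tf.zeros([tf.shape(samples)[0], 2048])
  return get_data(
      file_pattern='/tmp/train.tfrecord*',
      teacher_fn=dummy_teacher,
      output_dimension=2048,
      reader=tf.data.TFRecordDataset,
      samples_key='audio',
      min_length=16000,  # e.g. 1 second of audio at a 16 kHz sample rate
      batch_size=32,
      loop_forever=True,
      shuffle=True)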
def main(unused_argv):
  assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
  assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
  assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

  # Create output directory if it doesn't already exist.
  outdir = os.path.dirname(FLAGS.output_file)
  file_utils.MaybeMakeDirs(outdir)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
    exp_params.append({
        'embedding_name': elem[0],
        'model_name': elem[1],
        'label_name': FLAGS.label_name,
        'label_list': FLAGS.label_list,
        'train_glob': FLAGS.train_glob,
        'eval_glob': FLAGS.eval_glob,
        'test_glob': FLAGS.test_glob,
        # Either L2 normalization or speaker normalization. You could try both
        # if you wanted.
        'l2_normalization': FLAGS.speaker_id_name is None,
        'speaker_id_name': FLAGS.speaker_id_name,
        'save_model_dir': FLAGS.save_model_dir,
        'calculate_equal_error_rate': FLAGS.calculate_equal_error_rate,
    })

  # Make and run beam pipeline.
  beam_options = None

  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    _ = (root
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'Reshuffle' >> beam.Reshuffle()
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                num_shards=1))
def main(unused_argv):
  assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
  assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
  assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

  # Create output directory if it doesn't already exist.
  outdir = os.path.dirname(FLAGS.output_file)
  file_utils.MaybeMakeDirs(outdir)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
    exp_params.append({
        'embedding_name': elem[0],
        'model_name': elem[1],
        'label_name': FLAGS.label_name,
        'label_list': FLAGS.label_list,
        'train_glob': FLAGS.train_glob,
        'eval_glob': FLAGS.eval_glob,
        'test_glob': FLAGS.test_glob,
        # Either L2 normalization or speaker normalization. You could try both
        # if you wanted.
        'l2_normalization': FLAGS.speaker_id_name is None,
        'speaker_id_name': FLAGS.speaker_id_name,
    })

  # Make and run beam pipeline.
  p = beam.Pipeline()
  _ = (p
       | 'MakeCollection' >> beam.Create(exp_params)
       | 'CalcScores' >> beam.Map(
           lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
       | 'FormatText' >> beam.Map(format_text_line)
       | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file, num_shards=1))
  result = p.run()
  result.wait_until_finish()
def get_data(file_pattern,
             reader,
             samples_key,
             min_length,
             label_key,
             label_list,
             batch_size,
             loop_forever,
             shuffle,
             shuffle_buffer_size=10000,
             label_type=tf.int64):
  """Gets the data for TRILL finetuning.

  This function is *always* stochastic.

  Args:
    file_pattern: Glob for input data.
    reader: Class used to parse data on disk.
    samples_key: Name of audio samples in tf.Examples.
    min_length: The minimum audio length. Should take sample rate into account.
      Examples smaller than this are dropped. Examples longer than this are
      randomly cropped to this size.
    label_key: Name of label key in tf.Examples.
    label_list: Python list of all possible label values.
    batch_size: Batch size of data in returned tf.data.Dataset.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    shuffle_buffer_size: Size of shuffle buffer.
    label_type: Type of label field. Usually `tf.string` or `tf.int64`.

  Returns:
    A tf.data.Dataset of (samples, onehot labels).
  """
  assert file_utils.Glob(file_pattern), file_pattern

  # Audio samples are variable length.
  features = {
      samples_key: tf.io.VarLenFeature(dtype=tf.float32),
      label_key: tf.io.FixedLenFeature(
          shape=(), dtype=label_type, default_value=None),
  }

  # Load data into a dataset of batch size 1. Then preprocess.
  ds = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=1,
      num_epochs=None if loop_forever else 1,
      reader_num_threads=tf.data.experimental.AUTOTUNE,
      parser_num_threads=tf.data.experimental.AUTOTUNE,
      features=features,
      reader=reader,
      shuffle=shuffle,
      shuffle_buffer_size=shuffle_buffer_size,
      prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
      sloppy_ordering=True)

  ds = tf_data_pipeline(ds, samples_key, label_key, label_list, min_length,
                        batch_size)

  return ds
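# Usage sketch (not part of the original module; relies on the module's
# existing `tensorflow as tf` import). Key names, label values, and the glob
# are illustrative assumptions.
def _example_finetune_dataset():
  return get_data(
      file_pattern='/tmp/train.tfrecord*',
      reader=tf.data.TFRecordDataset,
      samples_key='audio',
      min_length=16000,
      label_key='label',
      label_list=['negative', 'neutral', 'positive'],
      batch_size=32,
      loop_forever=True,
      shuffle=True,
      label_type=tf.string)  # Labels stored as strings in the tf.Examples.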
def itervalues():
  for path in file_utils.Glob(glob):
    for raw_str in tf.python_io.tf_record_iterator(path):
      example = proto()
      example.ParseFromString(raw_str)
      yield example
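# For context, `glob` and `proto` above are closure variables from an
# enclosing function. Below is a self-contained sketch of how such an iterator
# might be wrapped; the wrapper name is hypothetical, and
# `tf.compat.v1.io.tf_record_iterator` is the TF2-compatible spelling of the
# TF1 `tf.python_io.tf_record_iterator` used above.
def iterate_tfrecord_protos(glob, proto=tf.train.Example):
  """Yields parsed protos from every TFRecord file matching `glob`."""
  for path in file_utils.Glob(glob):
    for raw_str in tf.compat.v1.io.tf_record_iterator(path):
      example = proto()
      example.ParseFromString(raw_str)
      yield example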
def get_data(file_pattern,
             reader,
             embedding_name,
             embedding_dim,
             preaveraged,
             label_name,
             label_list,
             batch_size,
             loop_forever,
             shuffle,
             shuffle_buffer_size=10000):
  """Gets the data for keras training.

  Note that if `preaveraged=False` and `batch_size>1`, batches will be cut to
  the shortest length and data will be lost.

  Args:
    file_pattern: Glob for input data.
    reader: Class used to parse data on disk.
    embedding_name: Name of embedding in tf.Examples.
    embedding_dim: Fixed size of embedding.
    preaveraged: Python bool. If `True`, expect embeddings to be of size
      (1, embedding_dim). Otherwise, it's (var len, embedding_dim).
    label_name: Name of label key in tf.Examples.
    label_list: Python list of all possible label values.
    batch_size: Batch size of data in returned tf.data.Dataset.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    shuffle_buffer_size: Size of shuffle buffer.

  Returns:
    A tf.data.Dataset of (embeddings, onehot labels).
  """
  assert file_utils.Glob(file_pattern), file_pattern

  emb_key = f'embedding/{embedding_name}'
  label_key = label_name

  # Preaveraged embeddings are fixed length; non-preaveraged are variable size.
  if preaveraged:
    emb_feat = tf.io.FixedLenFeature(
        shape=(1, embedding_dim), dtype=tf.float32)
  else:
    emb_feat = tf.io.VarLenFeature(dtype=tf.float32)
  features = {
      emb_key: emb_feat,
      label_key: tf.io.FixedLenFeature(
          shape=(), dtype=tf.string, default_value=None),
  }

  # Load data into a dataset.
  ds = tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=batch_size,
      num_epochs=None if loop_forever else 1,
      reader_num_threads=tf.data.experimental.AUTOTUNE,
      parser_num_threads=2,
      features=features,
      reader=reader,
      shuffle=shuffle,
      shuffle_buffer_size=shuffle_buffer_size,
      prefetch_buffer_size=batch_size,  # Consider tf.data.experimental.AUTOTUNE.
      sloppy_ordering=True)

  ds = ds.map(lambda kv: (kv[emb_key], kv[label_key]),
              num_parallel_calls=tf.data.experimental.AUTOTUNE)
  if preaveraged:
    reshape_fn = _reshape_preaveraged
  else:
    reshape_fn = functools.partial(_reshape_full, embedding_dim=embedding_dim)
  ds = ds.map(reshape_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
  ds = ds.map(functools.partial(_y_to_onehot, label_list=label_list),
              num_parallel_calls=tf.data.experimental.AUTOTUNE)

  return ds
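# Usage sketch (not part of the original module; relies on the module's
# existing `tensorflow as tf` import). Key names, dimensions, and the glob are
# illustrative. As the docstring warns, with `preaveraged=False` and
# `batch_size > 1` each batch is truncated to its shortest element.
def _example_preaveraged_dataset():
  return get_data(
      file_pattern='/tmp/embeddings.tfrecord*',
      reader=tf.data.TFRecordDataset,
      embedding_name='trill',  # Reads the 'embedding/trill' feature.
      embedding_dim=2048,
      preaveraged=True,
      label_name='label',
      label_list=['no', 'yes'],
      batch_size=64,
      loop_forever=True,
      shuffle=True)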
def get_data(file_pattern,
             reader,
             embedding_name,
             embedding_dim,
             label_name,
             label_list,
             bucket_boundaries,
             bucket_batch_sizes,
             loop_forever,
             shuffle,
             shuffle_buffer_size=10000):
  """Gets the data for keras training.

  Args:
    file_pattern: Glob for input data.
    reader: Class used to parse data on disk.
    embedding_name: Name of embedding in tf.Examples.
    embedding_dim: Fixed size of embedding.
    label_name: Name of label key in tf.Examples.
    label_list: Python list of all possible label values.
    bucket_boundaries: Boundaries for bucketing.
    bucket_batch_sizes: Batch size per bucket.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    shuffle_buffer_size: Size of shuffle buffer.

  Returns:
    A tf.data.Dataset of (embeddings, onehot labels).
  """
  assert file_utils.Glob(file_pattern), file_pattern
  assert isinstance(bucket_boundaries, (tuple, list))
  if isinstance(bucket_boundaries[0], str):
    bucket_boundaries = [int(x) for x in bucket_boundaries]

  emb_key = f'embedding/{embedding_name}'
  label_key = label_name

  features = {
      emb_key: tf.io.VarLenFeature(dtype=tf.float32),
      label_key: tf.io.FixedLenFeature(
          shape=(), dtype=tf.string, default_value=None),
  }

  # Load data into a dataset.
  ds = (tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=1,
      num_epochs=None if loop_forever else 1,
      reader_num_threads=tf.data.experimental.AUTOTUNE,
      parser_num_threads=2,
      features=features,
      reader=reader,
      shuffle=shuffle,
      shuffle_buffer_size=shuffle_buffer_size,
      prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
      sloppy_ordering=True)
        .map(lambda kv: (kv[emb_key], kv[label_key]),
             num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .map(functools.partial(_reshape_full, embedding_dim=embedding_dim),
             num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .map(functools.partial(_y_to_onehot, label_list=label_list),
             num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .map(_remove_batchdim_one,
             num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .apply(tf.data.experimental.bucket_by_sequence_length(
            element_length_func=lambda emb, lbl: tf.shape(emb)[0],
            bucket_boundaries=bucket_boundaries,
            bucket_batch_sizes=bucket_batch_sizes,
            padded_shapes=None,
            padding_values=None,
            pad_to_bucket_boundary=False,
            no_padding=False,
            drop_remainder=False)))

  return ds
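# Usage sketch (not part of the original module; relies on the module's
# existing `tensorflow as tf` import). Note that
# `tf.data.experimental.bucket_by_sequence_length` expects
# len(bucket_batch_sizes) == len(bucket_boundaries) + 1, i.e. one batch size
# per bucket including the final open-ended bucket. Key names, boundaries, and
# the glob are illustrative.
def _example_bucketed_dataset():
  return get_data(
      file_pattern='/tmp/embeddings.tfrecord*',
      reader=tf.data.TFRecordDataset,
      embedding_name='trill',
      embedding_dim=2048,
      label_name='label',
      label_list=['no', 'yes'],
      bucket_boundaries=(100, 500),    # Three buckets: <100, 100-499, >=500 frames.
      bucket_batch_sizes=(64, 32, 8),  # One batch size per bucket.
      loop_forever=True,
      shuffle=True)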
def get_data(file_pattern,
             output_dimension,
             reader,
             samples_key,
             min_length,
             batch_size,
             loop_forever,
             shuffle,
             teacher_fn=None,
             target_key=None,
             shuffle_buffer_size=10000):
  """Gets data for TRILL distillation from a teacher or precomputed values.

  Args:
    file_pattern: Glob for input data.
    output_dimension: Feature dimension of teacher output.
    reader: Class used to parse data on disk.
    samples_key: Name of audio samples in tf.Examples.
    min_length: The minimum audio length. Should take sample rate into account.
      Examples smaller than this are dropped. Examples longer than this are
      randomly cropped to this size. If we are using precomputed targets, drop
      examples that aren't equal to this.
    batch_size: Batch size of data in returned tf.data.Dataset.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    teacher_fn: Optional. A function that takes 1 argument and returns label
      embeddings. If `None`, get precomputed data from disk. If present, run
      teacher function as part of the input pipeline. If `teacher_fn` is
      `None`, `target_key` must be not None.
    target_key: Required if reading precomputed features. Location of the
      target embeddings.
    shuffle_buffer_size: Size of shuffle buffer.

  Returns:
    A tf.data.Dataset of (audio samples, regression targets).
  """
  assert file_utils.Glob(file_pattern), file_pattern

  if teacher_fn is None:
    assert target_key
    # Use precomputed targets. We trust the data generation process to create
    # inputs of the right size, so use fixed-length input for samples.
    features = {
        samples_key: tf.io.FixedLenFeature([min_length], tf.float32),
        target_key: tf.io.FixedLenFeature([output_dimension], tf.float32),
    }
    cur_batch_size = batch_size

    def _rename_dict(kv):
      return {SAMPLES_: kv[samples_key], TARGETS_: kv[target_key]}
  else:
    assert target_key is None
    features = {
        samples_key: tf.io.VarLenFeature(dtype=tf.float32),
    }
    cur_batch_size = 1

    def _rename_dict(kv):
      return {SAMPLES_: kv[samples_key]}

  # Load data into a dataset of batch size 1, then preprocess if necessary.
  ds = (tf.data.experimental.make_batched_features_dataset(
      file_pattern=file_pattern,
      batch_size=cur_batch_size,
      num_epochs=None if loop_forever else 1,
      reader_num_threads=tf.data.experimental.AUTOTUNE,
      parser_num_threads=tf.data.experimental.AUTOTUNE,
      features=features,
      reader=reader,
      shuffle=shuffle,
      shuffle_buffer_size=shuffle_buffer_size,
      prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
      sloppy_ordering=True)
        .map(_rename_dict, num_parallel_calls=tf.data.experimental.AUTOTUNE))

  if teacher_fn is not None:
    # Create target embeddings from `teacher_fn`.
    assert callable(teacher_fn)

    @tf.function
    def _audio_to_embeddings(samples):
      return _audio_to_embeddings_fn(samples, teacher_fn, output_dimension)

    ds = (ds
          .filter(lambda kv: _filter_fn(kv, min_length))
          .map(lambda kv: _crop_fn(kv, min_length),
               num_parallel_calls=tf.data.experimental.AUTOTUNE)
          .batch(batch_size)
          .map(_audio_to_embeddings,
               num_parallel_calls=tf.data.experimental.AUTOTUNE))

  # Convert results to tuple.
  ds = (ds
        .map(lambda kv: (kv[SAMPLES_], kv[TARGETS_]),
             num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .prefetch(2))

  assert len(ds.element_spec) == 2, ds.element_spec
  ds.element_spec[0].shape.assert_is_compatible_with(
      [None, min_length])  # audio samples
  ds.element_spec[1].shape.assert_is_compatible_with(
      [None, output_dimension])  # teacher embeddings

  return ds
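# Usage sketch (not part of the original module; relies on the module's
# existing `tensorflow as tf` import) showing the two modes the docstring
# describes: targets computed on the fly by `teacher_fn`, and precomputed
# targets read from `target_key`. Paths, key names, and dimensions are
# illustrative assumptions.
def _example_distillation_datasets():
  # Mode 1: run a teacher inside the input pipeline (`target_key` must be None).
  dummy_teacher = lambda samples: tf.zeros([tf.shape(samples)[0], 2048])
  ds_on_the_fly = get_data(
      file_pattern='/tmp/raw_audio.tfrecord*',
      output_dimension=2048,
      reader=tf.data.TFRecordDataset,
      samples_key='audio',
      min_length=16000,
      batch_size=32,
      loop_forever=True,
      shuffle=True,
      teacher_fn=dummy_teacher,
      target_key=None)

  # Mode 2: read precomputed targets from disk (`teacher_fn` must be None).
  ds_precomputed = get_data(
      file_pattern='/tmp/precomputed.tfrecord*',
      output_dimension=2048,
      reader=tf.data.TFRecordDataset,
      samples_key='audio',
      min_length=16000,
      batch_size=32,
      loop_forever=True,
      shuffle=True,
      teacher_fn=None,
      target_key='embedding/teacher')
  return ds_on_the_fly, ds_precomputed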