def read_input_glob_and_sample_rate_from_flags(input_glob_flag,
                                               sample_rate_flag,
                                               tfds_dataset_flag,
                                               output_filename_flag,
                                               tfds_data_dir_flag):
    """Read flags for input data and sample rate.

  Args:
    input_glob_flag: String flag. The input file glob.
    sample_rate_flag: String flag. The sample rate.
    tfds_dataset_flag: String flag. The TFDS dataset.
    output_filename_flag: String flag. The output filename.
    tfds_data_dir_flag: String flag. Optional location of local TFDS data.

  Returns:
    (input_filenames, output_filenames, sample_rate)
    `input_filenames` is a list of lists of filenames. `output_filenames` is a
    list of the same length.
  """
    if input_glob_flag:
        assert file_utils.Glob(input_glob_flag), input_glob_flag
        assert not tfds_data_dir_flag
        input_filenames = [file_utils.Glob(input_glob_flag)]
        output_filenames = [output_filename_flag]
        sample_rate = sample_rate_flag
    else:
        assert tfds_dataset_flag
        dataset_name = tfds_dataset_flag
        # Download dataset, if necessary.
        tfds.load(dataset_name, data_dir=tfds_data_dir_flag)
        sample_rate = _tfds_sample_rate(dataset_name, tfds_data_dir_flag)
        assert sample_rate, sample_rate

        input_filenames = []
        output_filenames = []
        for split_name in ('train', 'validation', 'test'):
            input_filenames.append(
                _tfds_filenames(dataset_name, split_name, tfds_data_dir_flag))
            output_filenames.append(output_filename_flag + f'.{split_name}')

        logging.info('TFDS input filenames: %s', input_filenames)
        logging.info('sample rate: %s', sample_rate)

    if sample_rate:
        assert isinstance(sample_rate, numbers.Number)

    for filename_list in input_filenames:
        for filename in filename_list:
            assert tf.io.gfile.exists(filename), filename
    assert len(input_filenames) == len(output_filenames)

    return input_filenames, output_filenames, sample_rate
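# A usage sketch with hypothetical flag values (not the real flag module): a
# non-empty glob takes the first branch; an empty glob plus a TFDS dataset name
# takes the TFDS branch and returns one entry per train/validation/test split.
input_filenames, output_filenames, sample_rate = (
    read_input_glob_and_sample_rate_from_flags(
        input_glob_flag='/tmp/audio/*.tfrecord',  # hypothetical path
        sample_rate_flag=16000,
        tfds_dataset_flag=None,
        output_filename_flag='/tmp/embeddings/out.tfrecord',
        tfds_data_dir_flag=None))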
Example #2
def main(unused_argv):
  assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
  assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
  assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

  # Create output directory if it doesn't already exist.
  outdir = os.path.dirname(FLAGS.output_file)
  file_utils.MaybeMakeDirs(outdir)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
    def _params_dict(
        l2_normalization, speaker_id_name=FLAGS.speaker_id_name, elem=elem):
      return {
          'embedding_name': elem[0],
          'model_name': elem[1],
          'label_name': FLAGS.label_name,
          'label_list': FLAGS.label_list,
          'train_glob': FLAGS.train_glob,
          'eval_glob': FLAGS.eval_glob,
          'test_glob': FLAGS.test_glob,
          'l2_normalization': l2_normalization,
          'speaker_id_name': speaker_id_name,
          'save_model_dir': FLAGS.save_model_dir,
          'eval_metric': FLAGS.eval_metric,
      }
    exp_params.append(_params_dict(l2_normalization=True))
    exp_params.append(_params_dict(l2_normalization=False))
    if FLAGS.speaker_id_name is not None:
      exp_params.append(
          _params_dict(l2_normalization=True, speaker_id_name=None))
      exp_params.append(
          _params_dict(l2_normalization=False, speaker_id_name=None))

  # Make and run beam pipeline.
  beam_options = None

  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    _ = (root
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'Reshuffle' >> beam.Reshuffle()
         | 'WriteOutput' >> beam.io.WriteToText(
             FLAGS.output_file, num_shards=1)
        )
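# `format_text_line` is used above but not shown; a minimal sketch, assuming it
# receives the (params_dict, score) tuples produced by the 'CalcScores' step
# and returns one text line per experiment.
def format_text_line(kv):
    params, score = kv
    return ','.join([params['embedding_name'], params['model_name'],
                     str(params['l2_normalization']), str(score)])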
Example #3
def validate_inputs(
    input_filenames_list, output_filenames, embedding_modules, embedding_names,
    module_output_keys):
  """Validate inputs and input flags."""
  for filename_list in input_filenames_list:
    for filename in filename_list:
      assert tf.io.gfile.exists(filename), filename
  assert len(input_filenames_list) == len(output_filenames)

  # Make sure output files don't already exist.
  for output_filename in output_filenames:
    assert not file_utils.Glob(f'{output_filename}*'), output_filename

  # Lengths of flag lists must be the same.
  assert len(embedding_names) == len(embedding_modules), (
      embedding_names, embedding_modules)
  assert len(embedding_modules) == len(module_output_keys), (
      embedding_modules, module_output_keys)
  # Shortnames must be unique.
  assert len(set(embedding_names)) == len(embedding_names), embedding_names

  # Create output directory if it doesn't already exist.
  for output_filename in output_filenames:
    output_dir = output_filename.rsplit('/', 1)[0]
    file_utils.MaybeMakeDirs(output_dir)
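# A call sketch with hypothetical arguments; the three per-embedding lists must
# have the same length and the short names must be unique, or the asserts above
# fire.
validate_inputs(
    input_filenames_list=[['/data/train-00000-of-00001.tfrecord']],
    output_filenames=['/out/train_embeddings'],
    embedding_modules=['some/tfhub/module/handle'],
    embedding_names=['my_embedding'],
    module_output_keys=['embedding'])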
Example #4
def get_data(file_pattern,
             teacher_fn,
             output_dimension,
             reader,
             samples_key,
             min_length,
             batch_size,
             loop_forever,
             shuffle,
             shuffle_buffer_size=10000):
    """Gets the data for TRILL distillation.

  This function is *always* stochastic.

  Args:
    file_pattern: Glob for input data.
    teacher_fn: A function that takes 1 argument and returns label embeddings.
    output_dimension: Feature dimension of teacher output.
    reader: Class used to parse data on disk.
    samples_key: Name of audio samples in tf.Examples.
    min_length: The minimum audio length. Should take sample rate into account.
      Examples smaller than this are dropped. Examples longer than this are
      randomly cropped to this size.
    batch_size: Batch size of data in returned tf.data.Dataset.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    shuffle_buffer_size: Size of shuffle buffer.

  Returns:
    A tf.data.Dataset of (audio samples, regression targets).
  """
    assert file_utils.Glob(file_pattern), file_pattern
    assert callable(teacher_fn)

    # Audio samples are variable length.
    features = {
        samples_key: tf.io.VarLenFeature(dtype=tf.float32),
    }

    # Load data into a dataset of batch size 1. Then preprocess.
    ds = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=1,
        num_epochs=None if loop_forever else 1,
        reader_num_threads=tf.data.experimental.AUTOTUNE,
        parser_num_threads=tf.data.experimental.AUTOTUNE,
        features=features,
        reader=reader,
        shuffle=shuffle,
        shuffle_buffer_size=shuffle_buffer_size,
        prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
        sloppy_ordering=True)

    ds = tf_data_pipeline(ds, teacher_fn, samples_key, min_length, batch_size,
                          output_dimension)

    return ds
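# A call sketch with a stand-in teacher (names and paths are illustrative).
# The teacher maps a (batch, min_length) float tensor of audio samples to a
# (batch, output_dimension) tensor of target embeddings.
def fake_teacher_fn(samples):
    # Mean-pool the waveform and tile it to a fixed 2048-dim "embedding".
    return tf.tile(tf.reduce_mean(samples, axis=1, keepdims=True), [1, 2048])

ds = get_data(
    file_pattern='/data/train-*.tfrecord',  # hypothetical path
    teacher_fn=fake_teacher_fn,
    output_dimension=2048,
    reader=tf.data.TFRecordDataset,
    samples_key='audio',
    min_length=16000,  # 1 second at 16 kHz
    batch_size=32,
    loop_forever=True,
    shuffle=True)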
Example #5
def main(unused_argv):
  assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
  assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
  assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

  # Create output directory if it doesn't already exist.
  outdir = os.path.dirname(FLAGS.output_file)
  file_utils.MaybeMakeDirs(outdir)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
    exp_params.append({
        'embedding_name': elem[0],
        'model_name': elem[1],
        'label_name': FLAGS.label_name,
        'label_list': FLAGS.label_list,
        'train_glob': FLAGS.train_glob,
        'eval_glob': FLAGS.eval_glob,
        'test_glob': FLAGS.test_glob,
        # Either L2 normalization or speaker normalization. You could try both
        # if you wanted.
        'l2_normalization': FLAGS.speaker_id_name is None,
        'speaker_id_name': FLAGS.speaker_id_name,
        'save_model_dir': FLAGS.save_model_dir,
        'calculate_equal_error_rate': FLAGS.calculate_equal_error_rate,
    })

  # Make and run beam pipeline.
  beam_options = None

  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline(beam_options) as root:
    _ = (root
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'Reshuffle' >> beam.Reshuffle()
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file, num_shards=1)
        )
Example #6
def main(unused_argv):
    assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
    assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
    assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

    # Create output directory if it doesn't already exist.
    outdir = os.path.dirname(FLAGS.output_file)
    file_utils.MaybeMakeDirs(outdir)

    # Enumerate the configurations we want to run.
    exp_params = []
    model_names = models.get_sklearn_models().keys()
    for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
        exp_params.append({
            'embedding_name': elem[0],
            'model_name': elem[1],
            'label_name': FLAGS.label_name,
            'label_list': FLAGS.label_list,
            'train_glob': FLAGS.train_glob,
            'eval_glob': FLAGS.eval_glob,
            'test_glob': FLAGS.test_glob,
            # Either L2 normalization or speaker normalization. You could try both
            # if you wanted.
            'l2_normalization': FLAGS.speaker_id_name is None,
            'speaker_id_name': FLAGS.speaker_id_name,
        })

    # Make and run beam pipeline.
    p = beam.Pipeline()
    _ = (p
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file, num_shards=1))
    result = p.run()
    result.wait_until_finish()
Example #7
def get_data(file_pattern,
             reader,
             samples_key,
             min_length,
             label_key,
             label_list,
             batch_size,
             loop_forever,
             shuffle,
             shuffle_buffer_size=10000,
             label_type=tf.int64):
    """Gets the data for TRILL finetuning.

  This function is *always* stochastic.

  Args:
    file_pattern: Glob for input data.
    reader: Class used to parse data on disk.
    samples_key: Name of audio samples in tf.Examples.
    min_length: The minimum audio length. Should take sample rate into account.
      Examples smaller than this are dropped. Examples longer than this are
      randomly cropped to this size.
    label_key: Name of label key in tf.Examples.
    label_list: Python list of all possible label values.
    batch_size: Batch size of data in returned tf.data.Dataset.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    shuffle_buffer_size: Size of shuffle buffer.
    label_type: Type of label field. Usually `tf.string` or `tf.int64`.

  Returns:
    A tf.data.Dataset of (samples, onehot labels).
  """
    assert file_utils.Glob(file_pattern), file_pattern

    # Audio samples are variable length.
    features = {
        samples_key: tf.io.VarLenFeature(dtype=tf.float32),
        label_key: tf.io.FixedLenFeature(
            shape=(), dtype=label_type, default_value=None),
    }

    # Load data into a dataset of batch size 1. Then preprocess.
    ds = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=1,
        num_epochs=None if loop_forever else 1,
        reader_num_threads=tf.data.experimental.AUTOTUNE,
        parser_num_threads=tf.data.experimental.AUTOTUNE,
        features=features,
        reader=reader,
        shuffle=shuffle,
        shuffle_buffer_size=shuffle_buffer_size,
        prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
        sloppy_ordering=True)

    ds = tf_data_pipeline(ds, samples_key, label_key, label_list, min_length,
                          batch_size)

    return ds
Example #8
def itervalues():
    # `glob` and `proto` are read from the enclosing scope.
    for path in file_utils.Glob(glob):
        for raw_str in tf.python_io.tf_record_iterator(path):
            example = proto()
            example.ParseFromString(raw_str)
            yield example
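# The snippet above reads `glob` and `proto` from an enclosing scope and uses
# the TF1 record iterator; a minimal self-contained wrapper (hypothetical name)
# might look like this. In TF2, tf.compat.v1.io.tf_record_iterator is the
# equivalent of tf.python_io.tf_record_iterator.
def iterate_examples(glob, proto=tf.train.Example):
    def itervalues():
        for path in file_utils.Glob(glob):
            for raw_str in tf.compat.v1.io.tf_record_iterator(path):
                example = proto()
                example.ParseFromString(raw_str)
                yield example
    return itervalues()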
Example #9
def get_data(file_pattern,
             reader,
             embedding_name,
             embedding_dim,
             preaveraged,
             label_name,
             label_list,
             batch_size,
             loop_forever,
             shuffle,
             shuffle_buffer_size=10000):
    """Gets the data for keras training.

  Note that if `preaveraged=False` and `batch_size>1`, batches will be cut to
  the shortest length and data will be lost.

  Args:
    file_pattern: Glob for input data.
    reader: Class used to parse data on disk.
    embedding_name: Name of embedding in tf.Examples.
    embedding_dim: Fixed size of embedding.
    preaveraged: Python bool. If `True`, expect embeddings to be of size
      (1, embedding_dim). Otherwise, it's (var len, embedding_dim).
    label_name: Name of label key in tf.Examples.
    label_list: Python list of all possible label values.
    batch_size: Batch size of data in returned tf.data.Dataset.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    shuffle_buffer_size: Size of shuffle buffer.

  Returns:
    A tf.data.Dataset of (embeddings, onehot labels).
  """
    assert file_utils.Glob(file_pattern), file_pattern
    emb_key = f'embedding/{embedding_name}'
    label_key = label_name

    # Preaveraged embeddings are fixed length, non-preaveraged are variable size.
    if preaveraged:
        emb_feat = tf.io.FixedLenFeature(shape=(1, embedding_dim),
                                         dtype=tf.float32)
    else:
        emb_feat = tf.io.VarLenFeature(dtype=tf.float32)
    features = {
        emb_key: emb_feat,
        label_key: tf.io.FixedLenFeature(
            shape=(), dtype=tf.string, default_value=None),
    }

    # Load data into a dataset.
    ds = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        num_epochs=None if loop_forever else 1,
        reader_num_threads=tf.data.experimental.AUTOTUNE,
        parser_num_threads=2,
        features=features,
        reader=reader,
        shuffle=shuffle,
        shuffle_buffer_size=shuffle_buffer_size,
        prefetch_buffer_size=batch_size,  # consider tf.data.experimental.AUTOTUNE
        sloppy_ordering=True)

    ds = ds.map(lambda kv: (kv[emb_key], kv[label_key]),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if preaveraged:
        reshape_fn = _reshape_preaveraged
    else:
        reshape_fn = functools.partial(_reshape_full,
                                       embedding_dim=embedding_dim)
    ds = ds.map(reshape_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.map(functools.partial(_y_to_onehot, label_list=label_list),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

    return ds
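# A call sketch with hypothetical names. Per the note in the docstring, with
# preaveraged=False a batch_size of 1 avoids cutting variable-length embeddings
# to the shortest element in the batch.
ds = get_data(
    file_pattern='/data/train-*.tfrecord',  # hypothetical path
    reader=tf.data.TFRecordDataset,
    embedding_name='my_embedding',
    embedding_dim=2048,
    preaveraged=False,
    label_name='label',
    label_list=['yes', 'no'],
    batch_size=1,
    loop_forever=True,
    shuffle=True)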
Example #10
def get_data(file_pattern,
             reader,
             embedding_name,
             embedding_dim,
             label_name,
             label_list,
             bucket_boundaries,
             bucket_batch_sizes,
             loop_forever,
             shuffle,
             shuffle_buffer_size=10000):
    """Gets the data for keras training.

  Args:
    file_pattern: Glob for input data.
    reader: Class used to parse data on disk.
    embedding_name: Name of embedding in tf.Examples.
    embedding_dim: Fixed size of embedding.
    label_name: Name of label key in tf.Examples.
    label_list: Python list of all possible label values.
    bucket_boundaries: Boundaries for bucketing.
    bucket_batch_sizes: Batch sizes, one per bucket (one more entry than
      `bucket_boundaries`).
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    shuffle_buffer_size: Size of shuffle buffer.

  Returns:
    A tf.data.Dataset of (embeddings, onehot labels).
  """
    assert file_utils.Glob(file_pattern), file_pattern
    assert isinstance(bucket_boundaries, (tuple, list))
    if isinstance(bucket_boundaries[0], str):
        bucket_boundaries = [int(x) for x in bucket_boundaries]

    emb_key = f'embedding/{embedding_name}'
    label_key = label_name

    features = {
        emb_key: tf.io.VarLenFeature(dtype=tf.float32),
        label_key: tf.io.FixedLenFeature(
            shape=(), dtype=tf.string, default_value=None),
    }

    # Load data into a dataset.
    ds = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=1,
        num_epochs=None if loop_forever else 1,
        reader_num_threads=tf.data.experimental.AUTOTUNE,
        parser_num_threads=2,
        features=features,
        reader=reader,
        shuffle=shuffle,
        shuffle_buffer_size=shuffle_buffer_size,
        prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
        sloppy_ordering=True)
    ds = ds.map(lambda kv: (kv[emb_key], kv[label_key]),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.map(functools.partial(_reshape_full, embedding_dim=embedding_dim),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.map(functools.partial(_y_to_onehot, label_list=label_list),
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.map(_remove_batchdim_one,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    # Bucket variable-length sequences so each batch contains similar lengths.
    ds = ds.apply(
        tf.data.experimental.bucket_by_sequence_length(
            element_length_func=lambda emb, lbl: tf.shape(emb)[0],
            bucket_boundaries=bucket_boundaries,
            bucket_batch_sizes=bucket_batch_sizes,
            padded_shapes=None,
            padding_values=None,
            pad_to_bucket_boundary=False,
            no_padding=False,
            drop_remainder=False))

    return ds
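# `bucket_by_sequence_length` expects one batch size per bucket, i.e.
# len(bucket_batch_sizes) == len(bucket_boundaries) + 1. A hypothetical call:
ds = get_data(
    file_pattern='/data/train-*.tfrecord',  # hypothetical path
    reader=tf.data.TFRecordDataset,
    embedding_name='my_embedding',
    embedding_dim=2048,
    label_name='label',
    label_list=['yes', 'no'],
    bucket_boundaries=(100, 200),     # 3 buckets: <100, 100-199, >=200 frames
    bucket_batch_sizes=(64, 32, 16),  # one batch size per bucket
    loop_forever=True,
    shuffle=True)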
Example #11
def get_data(file_pattern,
             output_dimension,
             reader,
             samples_key,
             min_length,
             batch_size,
             loop_forever,
             shuffle,
             teacher_fn=None,
             target_key=None,
             shuffle_buffer_size=10000):
    """Gets data for TRILL distillation from a teacher or precomputed values.

  Args:
    file_pattern: Glob for input data.
    output_dimension: Feature dimension of teacher output.
    reader: Class used to parse data on disk.
    samples_key: Name of audio samples in tf.Examples.
    min_length: The minimum audio length. Should take sample rate into account.
      Examples smaller than this are dropped. Examples longer than this are
      randomly cropped to this size. If precomputed targets are used, examples
      whose length isn't exactly this are dropped.
    batch_size: Batch size of data in returned tf.data.Dataset.
    loop_forever: Python bool. Whether to loop forever.
    shuffle: Python bool. Whether to shuffle data.
    teacher_fn: Optional. A function that takes 1 argument and returns label
      embeddings. If `None`, get precomputed data from disk. If present, run
      the teacher function as part of the input pipeline. If `teacher_fn` is
      `None`, `target_key` must not be None.
    target_key: Required if reading precomputed features. Location of the target
      embeddings.
    shuffle_buffer_size: Size of shuffle buffer.

  Returns:
    A tf.data.Dataset of (audio samples, regression targets).
  """
    assert file_utils.Glob(file_pattern), file_pattern

    if teacher_fn is None:
        assert target_key
        # Use precomputed targets. We trust the data generation process to create
        # inputs of the right size, so use fixed-length input for samples.
        features = {
            samples_key: tf.io.FixedLenFeature([min_length], tf.float32),
            target_key: tf.io.FixedLenFeature([output_dimension], tf.float32),
        }
        cur_batch_size = batch_size

        def _rename_dict(kv):
            return {SAMPLES_: kv[samples_key], TARGETS_: kv[target_key]}
    else:
        assert target_key is None
        features = {
            samples_key: tf.io.VarLenFeature(dtype=tf.float32),
        }
        cur_batch_size = 1

        def _rename_dict(kv):
            return {SAMPLES_: kv[samples_key]}

    # Load data into a dataset of batch size 1, then preprocess if necessary.
    ds = (tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=cur_batch_size,
        num_epochs=None if loop_forever else 1,
        reader_num_threads=tf.data.experimental.AUTOTUNE,
        parser_num_threads=tf.data.experimental.AUTOTUNE,
        features=features,
        reader=reader,
        shuffle=shuffle,
        shuffle_buffer_size=shuffle_buffer_size,
        prefetch_buffer_size=tf.data.experimental.AUTOTUNE,
        sloppy_ordering=True).map(
            _rename_dict, num_parallel_calls=tf.data.experimental.AUTOTUNE))

    if teacher_fn is not None:
        # Create target embeddings from `teacher_fn`.
        assert callable(teacher_fn)

        @tf.function
        def _audio_to_embeddings(samples):
            return _audio_to_embeddings_fn(samples, teacher_fn,
                                           output_dimension)

        ds = (ds.filter(lambda kv: _filter_fn(kv, min_length)).map(
            lambda kv: _crop_fn(kv, min_length),
            num_parallel_calls=tf.data.experimental.AUTOTUNE).batch(
                batch_size).map(
                    _audio_to_embeddings,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE))

    # Convert results to tuple.
    ds = (ds.map(lambda kv: (kv[SAMPLES_], kv[TARGETS_]),
                 num_parallel_calls=tf.data.experimental.AUTOTUNE).prefetch(2))

    assert len(ds.element_spec) == 2, ds.element_spec
    # Audio samples: (batch, min_length).
    ds.element_spec[0].shape.assert_is_compatible_with([None, min_length])
    ds.element_spec[1].shape.assert_is_compatible_with(
        [None, output_dimension])  # teacher embeddings

    return ds
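# A sketch of the two calling modes with hypothetical names and paths: exactly
# one of `teacher_fn` (compute targets on the fly) or `target_key` (read
# precomputed targets from disk) should be set.
ds_online = get_data(
    file_pattern='/data/train-*.tfrecord',
    output_dimension=2048,
    reader=tf.data.TFRecordDataset,
    samples_key='audio',
    min_length=16000,
    batch_size=32,
    loop_forever=True,
    shuffle=True,
    teacher_fn=lambda s: tf.zeros([tf.shape(s)[0], 2048]),  # stand-in teacher
    target_key=None)

ds_precomputed = get_data(
    file_pattern='/data/distilled-*.tfrecord',
    output_dimension=2048,
    reader=tf.data.TFRecordDataset,
    samples_key='audio',
    min_length=16000,
    batch_size=32,
    loop_forever=True,
    shuffle=True,
    teacher_fn=None,
    target_key='embedding/teacher')  # hypothetical feature key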