Example #1
0
def main(_):

    """Drive the data-prep beam pipeline configured entirely by flags."""
    input_filenames_list, output_filenames, beam_params = (
        utils.get_beam_params_from_flags())
    # Validate the flag-derived inputs before constructing any pipeline.
    utils.validate_inputs(
        input_filenames_list=input_filenames_list,
        output_filenames=output_filenames,
        embedding_modules=beam_params['embedding_modules'],
        embedding_names=beam_params['embedding_names'],
        module_output_keys=beam_params['module_output_keys'])
    logging.info('main: input_filenames_list: %s', input_filenames_list)
    logging.info('main: output_filenames: %s', output_filenames)
    logging.info('main: beam_params: %s', beam_params)

    # If you have custom beam options, add them here.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        # One sub-pipeline per (input, output) pair; suffix keeps stage
        # names unique within the shared pipeline.
        for idx, (cur_input, cur_output) in enumerate(
                zip(input_filenames_list, output_filenames)):
            utils.data_prep_pipeline(
                root=root,
                input_filenames_or_glob=cur_input,
                output_filename=cur_output,
                data_prep_behavior=FLAGS.data_prep_behavior,
                beam_params=beam_params,
                suffix=str(idx))
Example #2
0
def main(unused_argv):
    """Build and run the audio-to-embeddings beam pipeline from flags."""
    # Resolve input locations from flags. A TFDS dataset expands to the
    # train, validation, and test splits.
    (input_filenames_list, output_filenames,
     sample_rate) = audio_to_embeddings_beam_utils.read_input_glob_and_sample_rate_from_flags(
         FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset,
         FLAGS.output_filename)

    # Sanity-check inputs and flags before building the pipeline.
    audio_to_embeddings_beam_utils.validate_inputs(
        input_filenames_list, output_filenames, FLAGS.embedding_modules,
        FLAGS.embedding_names, FLAGS.module_output_keys)

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline() as root:
        # One sub-pipeline per (input, output) pair; suffix disambiguates
        # stage names inside the shared pipeline.
        for idx, (cur_inputs, cur_output) in enumerate(
                zip(input_filenames_list, output_filenames)):
            audio_to_embeddings_beam_utils.make_beam_pipeline(
                root,
                cur_inputs,
                sample_rate,
                FLAGS.debug,
                FLAGS.embedding_names,
                FLAGS.embedding_modules,
                FLAGS.module_output_keys,
                FLAGS.audio_key,
                FLAGS.sample_rate_key,
                FLAGS.label_key,
                FLAGS.speaker_id_key,
                FLAGS.average_over_time,
                FLAGS.delete_audio_from_output,
                cur_output,
                suffix=idx)
def _get_data_prep_params_from_flags():
    """Get parameters for data prep pipeline from flags.

    Returns:
        Tuple `(prep_params, input_filenames_list, output_filenames,
        run_data_prep)`. `run_data_prep` is False only when validation
        fails, `--skip_existing_error` is set, and filtering out existing
        outputs leaves nothing to do.

    Raises:
        ValueError: If required flags are missing or outputs are malformed.
    """
    if not FLAGS.output_filename:
        raise ValueError('Must provide output filename.')
    if not FLAGS.comma_escape_char:
        raise ValueError('`FLAGS.comma_escape_char` must be provided.')

    run_data_prep = True
    if FLAGS.train_input_glob:  # Explicitly pass globs.
        # Bug fix: the sentence period previously sat inside the backticks,
        # garbling the flag name in the error message.
        if not FLAGS.validation_input_glob:
            raise ValueError(
                'If using globs, must supply `validation_input_glob`.')
        if not FLAGS.test_input_glob:
            raise ValueError('If using globs, must supply `test_input_glob`.')
        input_filenames_list, output_filenames = [], []
        for input_glob, name in [(FLAGS.train_input_glob, 'train'),
                                 (FLAGS.validation_input_glob, 'validation'),
                                 (FLAGS.test_input_glob, 'test')]:
            # NOTE(review): get_beam_params_from_flags appears to read
            # `FLAGS.input_glob`, hence the per-split flag mutation — confirm.
            FLAGS.input_glob = input_glob
            cur_inputs, cur_outputs, prep_params = (
                utils.get_beam_params_from_flags())
            if len(cur_outputs) != 1:
                raise ValueError(f'`cur_outputs` too long: {cur_outputs}')
            # Suffix the single output filename with the split name.
            cur_outputs = f'{cur_outputs[0]}.{name}'

            input_filenames_list.extend(cur_inputs)
            output_filenames.append(cur_outputs)
    else:  # Get params from a TFDS dataset.
        if not FLAGS.tfds_dataset:
            raise ValueError(
                'Must supply TFDS dataset name if not globs provided.')
        input_filenames_list, output_filenames, prep_params = (
            utils.get_beam_params_from_flags())
    if len(output_filenames) != 3:
        raise ValueError(
            f'Data prep output must be 3 files: {output_filenames}')

    try:
        # Check that inputs and flags are formatted correctly.
        utils.validate_inputs(input_filenames_list, output_filenames,
                              prep_params['embedding_modules'],
                              prep_params['embedding_names'],
                              prep_params['module_output_keys'])
    except ValueError:
        if FLAGS.skip_existing_error:
            # Check if there are any files left after filtering. Return the
            # expected locations, though, and remove.
            _, output_filenames_filtered = _remove_existing_outputs(
                input_filenames_list, output_filenames)
            if not output_filenames_filtered:
                run_data_prep = False
        else:
            raise

    return prep_params, input_filenames_list, output_filenames, run_data_prep
def main(unused_argv):

    """Run the audio-to-embeddings beam pipeline configured by flags."""
    # Resolve input locations from flags. A TFDS dataset expands to the
    # train, validation, and test splits.
    (input_filenames_list, output_filenames,
     sample_rate) = audio_to_embeddings_beam_utils.read_input_glob_and_sample_rate_from_flags(
         FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset,
         FLAGS.output_filename, FLAGS.tfds_data_dir)

    # Sanity-check inputs and flags before constructing the pipeline.
    audio_to_embeddings_beam_utils.validate_inputs(
        input_filenames_list, output_filenames, FLAGS.embedding_modules,
        FLAGS.embedding_names, FLAGS.module_output_keys)

    input_format = 'tfrecord'
    output_format = 'tfrecord'

    # If you have custom beam options, add them here.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        # One sub-pipeline per (input, output) pair; suffix keeps the
        # pipeline stage names unique.
        for idx, (cur_input, cur_output) in enumerate(
                zip(input_filenames_list, output_filenames)):
            audio_to_embeddings_beam_utils.make_beam_pipeline(
                root,
                cur_input,
                sample_rate,
                FLAGS.debug,
                FLAGS.embedding_names,
                FLAGS.embedding_modules,
                FLAGS.module_output_keys,
                FLAGS.audio_key,
                FLAGS.sample_rate_key,
                FLAGS.label_key,
                FLAGS.speaker_id_key,
                FLAGS.average_over_time,
                FLAGS.delete_audio_from_output,
                cur_output,
                split_embeddings_into_separate_tables=(
                    FLAGS.split_embeddings_into_separate_tables),
                use_frontend_fn=FLAGS.use_frontend_fn,
                model_input_min_length=FLAGS.model_input_min_length,
                input_format=input_format,
                output_format=output_format,
                suffix=idx)
Example #5
0
def main(unused_argv):
  """Create and run the embedding beam pipeline from flags."""
  # Get the input data location from flags.
  input_filenames, sample_rate = (
      audio_to_embeddings_beam_utils.read_input_glob_and_sample_rate_from_flags(
          FLAGS.input_glob, FLAGS.sample_rate, FLAGS.tfds_dataset))

  # Check that the flags are formatted correctly.
  audio_to_embeddings_beam_utils.validate_inputs(
      FLAGS.output_filename, FLAGS.embedding_modules, FLAGS.embedding_names,
      FLAGS.module_output_keys)

  logging.info('Starting to create flume pipeline...')
  with beam.Pipeline() as root:
    audio_to_embeddings_beam_utils.make_beam_pipeline(
        root, input_filenames, sample_rate, FLAGS.debug, FLAGS.embedding_names,
        FLAGS.embedding_modules, FLAGS.module_output_keys, FLAGS.audio_key,
        FLAGS.sample_rate_key, FLAGS.label_key, FLAGS.speaker_id_key,
        FLAGS.average_over_time, FLAGS.delete_audio_from_output,
        FLAGS.output_filename)
Example #6
0
 def test_validate_inputs(self, input_glob):
   """Checks validate_inputs accepts well-formed inputs.

   Args:
     input_glob: If truthy, the test-dir glob is passed directly as the
       input; otherwise the glob is expanded to a concrete file list first.
   """
   file_glob = os.path.join(absltest.get_default_test_srcdir(), TEST_DIR, '*')
   if input_glob:
     # Pass the glob itself, nested to match the expected list-of-lists shape.
     input_filenames_list = [[file_glob]]
   else:
     # Expand the glob to real filenames before passing.
     filenames = tf.io.gfile.glob(file_glob)
     input_filenames_list = [filenames]
   output_filenames = [
       os.path.join(absltest.get_default_test_tmpdir(), 'fake1')]
   embedding_modules = ['m1', 'm2']
   embedding_names = ['n1', 'n2']
   module_output_keys = ['k1', 'k2']
   # Check that inputs and flags are formatted correctly.
   audio_to_embeddings_beam_utils.validate_inputs(
       input_filenames_list=input_filenames_list,
       output_filenames=output_filenames,
       embedding_modules=embedding_modules,
       embedding_names=embedding_names,
       module_output_keys=module_output_keys)
def main(unused_argv):
  """Runs data prep (optionally) then a sklearn eval over the outputs.

  First builds a beam pipeline that writes train/validation/test embedding
  tables, then a second beam pipeline that scores sklearn experiments via
  `sklearn_utils.train_and_get_score` and writes one results text file.
  """

  # Data prep setup.
  run_data_prep = True
  if FLAGS.train_input_glob:
    # Globs passed explicitly: all three splits must be provided.
    assert FLAGS.validation_input_glob
    assert FLAGS.test_input_glob
    input_filenames_list, output_filenames = [], []
    for input_glob in [
        FLAGS.train_input_glob, FLAGS.validation_input_glob,
        FLAGS.test_input_glob,
    ]:
      # NOTE(review): get_beam_params_from_flags appears to read
      # `FLAGS.input_glob`, hence this per-split flag mutation — confirm.
      FLAGS.input_glob = input_glob
      cur_inputs, cur_outputs, beam_params = data_prep_utils.get_beam_params_from_flags(
      )
      input_filenames_list.extend(cur_inputs)
      output_filenames.extend(cur_outputs)
  else:
    # No explicit globs: derive all params from the remaining flags.
    input_filenames_list, output_filenames, beam_params = data_prep_utils.get_beam_params_from_flags(
    )
  assert input_filenames_list, input_filenames_list
  assert output_filenames, output_filenames
  try:
    # Check that inputs and flags are formatted correctly.
    data_prep_utils.validate_inputs(
        input_filenames_list, output_filenames,
        beam_params['embedding_modules'], beam_params['embedding_names'],
        beam_params['module_output_keys'])
  except ValueError:
    # On validation failure, optionally skip data prep instead of failing.
    if FLAGS.skip_existing_error:
      run_data_prep = False
    else:
      raise
  logging.info('beam_params: %s', beam_params)

  # Generate sklearn eval experiment parameters based on data prep flags.
  if len(output_filenames) != 3:
    raise ValueError(f'Data prep output must be 3 files: {output_filenames}')
  # Make them globs.
  train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
  sklearn_results_output_file = FLAGS.results_output_file
  exp_params = sklearn_utils.experiment_params(
      embedding_list=beam_params['embedding_names'],
      speaker_id_name=FLAGS.speaker_id_key,
      label_name=FLAGS.label_key,
      label_list=FLAGS.label_list,
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      save_model_dir=None,
      save_predictions_dir=None,
      eval_metric=FLAGS.eval_metric,
  )
  logging.info('exp_params: %s', exp_params)

  # Make and run beam pipeline.
  # If you have custom beam options, add them here.
  beam_options = None

  if run_data_prep:
    logging.info('Data prep on: %s, %s...', input_filenames_list,
                 output_filenames)
    with beam.Pipeline(beam_options) as root:
      # One sub-pipeline per (input, output) pair; suffix keeps names unique.
      for i, (input_filenames_or_glob, output_filename) in enumerate(
          zip(input_filenames_list, output_filenames)):
        data_prep_utils.make_beam_pipeline(
            root,
            input_filenames=input_filenames_or_glob,
            output_filename=output_filename,
            suffix=str(i),
            **beam_params)

  # Check that previous beam pipeline wrote outputs.
  sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                               sklearn_results_output_file)
  logging.info('Eval sklearn...')
  with beam.Pipeline(beam_options) as root:
    # Score every experiment param dict, format as text, and write a
    # single-shard results file.
    _ = (
        root
        | 'MakeCollection' >> beam.Create(exp_params)
        | 'CalcScores' >> beam.Map(
            lambda d: (d, sklearn_utils.train_and_get_score(**d)))
        | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
        | 'Reshuffle' >> beam.Reshuffle()
        | 'WriteOutput' >> beam.io.WriteToText(
            sklearn_results_output_file, num_shards=1))