def main(unused_argv):
    # Validate flags and setup directories.
    utils.validate_flags(FLAGS.train_glob, FLAGS.eval_glob, FLAGS.test_glob,
                         FLAGS.output_file)

    # Generate experiment parameters based on flags.
    exp_params = utils.experiment_params(
        FLAGS.embedding_list,
        FLAGS.speaker_id_name,
        FLAGS.label_name,
        FLAGS.label_list,
        FLAGS.train_glob,
        FLAGS.eval_glob,
        FLAGS.test_glob,
        FLAGS.save_model_dir,
        FLAGS.save_predictions_dir,
        FLAGS.eval_metric,
    )
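    # Each element of `exp_params` is assumed to be a dict of keyword
    # arguments for `utils.train_and_get_score`, one per embedding /
    # hyperparameter combination, roughly of the form (keys are an
    # assumption based on the arguments above):
    #   {'embedding_name': ..., 'speaker_id_name': FLAGS.speaker_id_name,
    #    'label_name': FLAGS.label_name, 'label_list': FLAGS.label_list,
    #    'train_glob': FLAGS.train_glob, 'eval_glob': FLAGS.eval_glob,
    #    'test_glob': FLAGS.test_glob, 'save_model_dir': FLAGS.save_model_dir,
    #    'save_predictions_dir': FLAGS.save_predictions_dir,
    #    'eval_metric': FLAGS.eval_metric}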

    # Make and run beam pipeline.
    beam_options = None

    logging.info('Starting to create beam pipeline...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >> beam.Map(lambda d:
                                        (d, utils.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(utils.format_text_line)
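             # Reshuffle here is a common pattern to prevent the runner from
             # fusing the expensive scoring step with the single-shard write.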
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                    num_shards=1))


def main(unused_argv):

    # Data prep setup.
    (prep_params, input_filenames_list, output_filenames,
     run_data_prep) = _get_data_prep_params_from_flags()
    logging.info('beam_params: %s', prep_params)

    # Generate sklearn eval experiment parameters based on data prep flags.
    # The data prep output filenames double as the eval input globs.
    train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
    sklearn_results_output_file = FLAGS.results_output_file
    exp_params = sklearn_utils.experiment_params(
        train_glob=train_glob,
        eval_glob=eval_glob,
        test_glob=test_glob,
        embedding_list=prep_params['embedding_names'],
        speaker_id_name=FLAGS.speaker_id_key,
        label_name=FLAGS.label_key,
        label_list=FLAGS.label_list,
        save_model_dir=FLAGS.save_model_dir,
        save_predictions_dir=FLAGS.save_predictions_dir,
        eval_metrics=FLAGS.eval_metrics,
    )
    logging.info('exp_params: %s', exp_params)

    # Make and run beam pipeline.
    beam_options = None

    if run_data_prep:
        input_filenames_list, output_filenames = _remove_existing_outputs(
            input_filenames_list, output_filenames)
        logging.info('Data prep on: %s, %s...', input_filenames_list,
                     output_filenames)
        with beam.Pipeline(beam_options) as root:
            for i, (input_filenames_or_glob, output_filename) in enumerate(
                    zip(input_filenames_list, output_filenames)):
                utils.data_prep_pipeline(
                    root=root,
                    input_filenames_or_glob=input_filenames_or_glob,
                    output_filename=output_filename,
                    data_prep_behavior=FLAGS.data_prep_behavior,
                    beam_params=prep_params,
                    suffix=str(i))

    # Check that previous beam pipeline wrote outputs.
    sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                                 sklearn_results_output_file)
    logging.info('Eval sklearn...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >>
             beam.Map(lambda d: (d, sklearn_utils.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(
                 sklearn_results_output_file, num_shards=1))
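

# Hypothetical sketch of the `_remove_existing_outputs` helper used above; the
# real implementation lives elsewhere in the package. The assumed contract is
# to drop (input, output) pairs whose output shards already exist so that data
# prep is not rerun. Assumes `import tensorflow as tf` among the module
# imports; the shard-glob check is an assumption as well.
def _remove_existing_outputs(input_filenames_list, output_filenames):
    """Filters out pairs whose outputs already exist (assumed behavior)."""
    kept_inputs, kept_outputs = [], []
    for inputs, output in zip(input_filenames_list, output_filenames):
        # Outputs are written with shard suffixes, so glob for any match.
        if tf.io.gfile.glob(f'{output}*'):
            logging.info('Skipping existing output: %s', output)
            continue
        kept_inputs.append(inputs)
        kept_outputs.append(output)
    return kept_inputs, kept_outputs
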
def main(unused_argv):

  # Data prep setup. If a separate training glob is given, matching validation
  # and test globs must also be given, and beam params are gathered per split;
  # otherwise they are read from the flags in a single call.
  run_data_prep = True
  if FLAGS.train_input_glob:
    assert FLAGS.validation_input_glob, '--validation_input_glob is required.'
    assert FLAGS.test_input_glob, '--test_input_glob is required.'
    input_filenames_list, output_filenames = [], []
    for input_glob in [
        FLAGS.train_input_glob, FLAGS.validation_input_glob,
        FLAGS.test_input_glob,
    ]:
      # `get_beam_params_from_flags` reads FLAGS.input_glob, so point it at
      # the current split's glob before each call.
      FLAGS.input_glob = input_glob
      (cur_inputs, cur_outputs,
       beam_params) = data_prep_utils.get_beam_params_from_flags()
      input_filenames_list.extend(cur_inputs)
      output_filenames.extend(cur_outputs)
  else:
    (input_filenames_list, output_filenames,
     beam_params) = data_prep_utils.get_beam_params_from_flags()
  assert input_filenames_list, 'No input filenames found from flags.'
  assert output_filenames, 'No output filenames generated from flags.'
  try:
    # Check that inputs and flags are formatted correctly.
    data_prep_utils.validate_inputs(
        input_filenames_list, output_filenames,
        beam_params['embedding_modules'], beam_params['embedding_names'],
        beam_params['module_output_keys'])
  except ValueError:
    if FLAGS.skip_existing_error:
      run_data_prep = False
    else:
      raise
  logging.info('beam_params: %s', beam_params)

  # Generate sklearn eval experiment parameters based on data prep flags.
  if len(output_filenames) != 3:
    raise ValueError(f'Data prep output must be 3 files: {output_filenames}')
  # The data prep output filenames double as the eval input globs.
  train_glob, eval_glob, test_glob = [f'{x}*' for x in output_filenames]
  sklearn_results_output_file = FLAGS.results_output_file
  exp_params = sklearn_utils.experiment_params(
      embedding_list=beam_params['embedding_names'],
      speaker_id_name=FLAGS.speaker_id_key,
      label_name=FLAGS.label_key,
      label_list=FLAGS.label_list,
      train_glob=train_glob,
      eval_glob=eval_glob,
      test_glob=test_glob,
      save_model_dir=None,
      save_predictions_dir=None,
      eval_metric=FLAGS.eval_metric,
  )
  logging.info('exp_params: %s', exp_params)

  # Make and run beam pipeline.
  beam_options = None

  if run_data_prep:
    logging.info('Data prep on: %s, %s...', input_filenames_list,
                 output_filenames)
    with beam.Pipeline(beam_options) as root:
      for i, (input_filenames_or_glob, output_filename) in enumerate(
          zip(input_filenames_list, output_filenames)):
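        # `make_beam_pipeline` is assumed to read the input examples, compute
        # embeddings for each module listed in `beam_params`, and write one
        # output file (plus shard suffix) per split.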
        data_prep_utils.make_beam_pipeline(
            root,
            input_filenames=input_filenames_or_glob,
            output_filename=output_filename,
            suffix=str(i),
            **beam_params)

  # Check that previous beam pipeline wrote outputs.
  sklearn_utils.validate_flags(train_glob, eval_glob, test_glob,
                               sklearn_results_output_file)
  logging.info('Eval sklearn...')
  with beam.Pipeline(beam_options) as root:
    _ = (
        root
        | 'MakeCollection' >> beam.Create(exp_params)
        | 'CalcScores' >> beam.Map(
            lambda d: (d, sklearn_utils.train_and_get_score(**d)))
        | 'FormatText' >> beam.Map(sklearn_utils.format_text_line)
        | 'Reshuffle' >> beam.Reshuffle()
        | 'WriteOutput' >> beam.io.WriteToText(
            sklearn_results_output_file, num_shards=1))
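

# Entry-point sketch (an assumption for illustration; the original launcher
# scripts define their own flags and entry point). Assumes `from absl import
# app` and `from absl import flags` among the module imports.
if __name__ == '__main__':
  flags.mark_flags_as_required(['label_key', 'label_list'])
  app.run(main)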