def main(unused_argv):
    assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
    assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
    assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

    # Create output directory if it doesn't already exist.
    outdir = os.path.dirname(FLAGS.output_file)
    file_utils.MaybeMakeDirs(outdir)

    # Enumerate the configurations we want to run.
    exp_params = []
    model_names = models.get_sklearn_models().keys()
    for elem in itertools.product(*[FLAGS.embedding_list, model_names]):

        def _params_dict(l2_normalization,
                         speaker_id_name=FLAGS.speaker_id_name,
                         elem=elem):
            return {
                'embedding_name': elem[0],
                'model_name': elem[1],
                'label_name': FLAGS.label_name,
                'label_list': FLAGS.label_list,
                'train_glob': FLAGS.train_glob,
                'eval_glob': FLAGS.eval_glob,
                'test_glob': FLAGS.test_glob,
                'l2_normalization': l2_normalization,
                'speaker_id_name': speaker_id_name,
                'save_model_dir': FLAGS.save_model_dir,
                'save_predictions_dir': FLAGS.save_predictions_dir,
                'eval_metric': FLAGS.eval_metric,
            }

        exp_params.append(_params_dict(l2_normalization=True))
        exp_params.append(_params_dict(l2_normalization=False))
        if FLAGS.speaker_id_name is not None:
            exp_params.append(
                _params_dict(l2_normalization=True, speaker_id_name=None))
            exp_params.append(
                _params_dict(l2_normalization=False, speaker_id_name=None))

    # Make and run beam pipeline.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >>
             beam.Map(lambda d:
                      (d, train_and_eval_sklearn.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                    num_shards=1))
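
`format_text_line` is referenced by the pipeline but not defined in this snippet; a minimal sketch of what such a formatter could look like (the field choices here are an assumption, not the original implementation):

def format_text_line(item):
    """Hypothetical formatter: flatten a (params, score) tuple into one line."""
    params, score = item
    return '{},{},{}'.format(params['embedding_name'], params['model_name'],
                             score)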
Example 2
def main(unused_argv):
    assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
    assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
    assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

    # Create output directory if it doesn't already exist.
    outdir = os.path.dirname(FLAGS.output_file)
    file_utils.MaybeMakeDirs(outdir)

    # Enumerate the configurations we want to run.
    exp_params = []
    model_names = models.get_sklearn_models().keys()
    for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
        exp_params.append({
            'embedding_name': elem[0],
            'model_name': elem[1],
            'label_name': FLAGS.label_name,
            'label_list': FLAGS.label_list,
            'train_glob': FLAGS.train_glob,
            'eval_glob': FLAGS.eval_glob,
            'test_glob': FLAGS.test_glob,
            # Either L2 normalization or speaker normalization. You could try
            # both if you wanted.
            'l2_normalization': FLAGS.speaker_id_name is None,
            'speaker_id_name': FLAGS.speaker_id_name,
            'save_model_dir': FLAGS.save_model_dir,
            'calculate_equal_error_rate': FLAGS.calculate_equal_error_rate,
        })

    # Make and run beam pipeline.
    beam_options = None

    logging.info('Starting to create flume pipeline...')
    with beam.Pipeline(beam_options) as root:
        _ = (root
             | 'MakeCollection' >> beam.Create(exp_params)
             | 'CalcScores' >>
             beam.Map(lambda d:
                      (d, train_and_eval_sklearn.train_and_get_score(**d)))
             | 'FormatText' >> beam.Map(format_text_line)
             | 'Reshuffle' >> beam.Reshuffle()
             | 'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file,
                                                    num_shards=1))
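
In these snippets `beam_options` is left as `None`, so Beam falls back to its default runner configuration. If you need to control the runner explicitly, standard Apache Beam `PipelineOptions` can be passed in its place; for example, a purely local run might be configured like this (a sketch, not part of the original code):

from apache_beam.options.pipeline_options import PipelineOptions

# Run locally on the DirectRunner with a single worker.
beam_options = PipelineOptions(
    ['--runner=DirectRunner', '--direct_num_workers=1'])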
def experiment_params(
    embedding_list,
    speaker_id_name,
    label_name,
    label_list,
    train_glob,
    eval_glob,
    test_glob,
    save_model_dir,
    save_predictions_dir,
    eval_metrics,
    comma_escape_char='?',
):
  """Get experiment params."""
  # Sometimes we want commas to appear in `embedding_modules`,
  # `embedding_names`, or `module_output_key`. However, commas get split out in
  # Google's Python `DEFINE_list`. We compromise by introducing a special
  # character, which we replace with commas here.
  embedding_list = _maybe_add_commas(embedding_list, comma_escape_char)

  # Enumerate the configurations we want to run.
  exp_params = []
  model_names = models.get_sklearn_models().keys()
  for elem in itertools.product(*[embedding_list, model_names]):

    def _params_dict(l2_normalization,
                     speaker_id_name=speaker_id_name,
                     elem=elem):
      return {
          'embedding_name': elem[0],
          'model_name': elem[1],
          'label_name': label_name,
          'label_list': label_list,
          'train_glob': train_glob,
          'eval_glob': eval_glob,
          'test_glob': test_glob,
          'l2_normalization': l2_normalization,
          'speaker_id_name': speaker_id_name,
          'save_model_dir': save_model_dir,
          'save_predictions_dir': save_predictions_dir,
          'eval_metrics': eval_metrics,
      }

    exp_params.append(_params_dict(l2_normalization=True))
    exp_params.append(_params_dict(l2_normalization=False))
    if speaker_id_name is not None:
      exp_params.append(
          _params_dict(l2_normalization=True, speaker_id_name=None))
      exp_params.append(
          _params_dict(l2_normalization=False, speaker_id_name=None))

  return exp_params
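
The helper `_maybe_add_commas` is referenced above but not shown. Given the comment about the comma escape character, a plausible minimal implementation (an assumption, not the original helper) is a per-element replacement:

def _maybe_add_commas(maybe_list, comma_escape_char='?'):
  """Hypothetical helper: swap the escape character back to commas."""
  return [x.replace(comma_escape_char, ',') for x in maybe_list]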
Example 4
    def test_sklearn_models_sanity(self, model_name):
        # Set random seed according to:
        # https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development.
        np.random.seed(42)
        rn.seed(42)

        model = models.get_sklearn_models()[model_name]()

        # Actually train.
        inputs, targets = _get_some_data(9000)
        model.fit(inputs, targets)

        # Check that performance is near perfect.
        inputs, targets = _get_some_data(512)
        acc = model.score(inputs, targets)
        expected = 0.5 if 'forest' in model_name.lower() else 0.9
        self.assertGreater(acc, expected)
        logging.info('%s final acc: %f', model, acc)
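
`_get_some_data` is also external to this snippet. For the sanity check above to pass, it presumably returns an easily separable binary problem; one way to fake that, purely as an assumption, is two well-separated Gaussian clusters:

import numpy as np


def _get_some_data(num_samples, dim=16):
    """Hypothetical data generator: two well-separated Gaussian clusters."""
    targets = np.random.randint(0, 2, size=num_samples)
    # Shift class 1 far from class 0 so most models score near 1.0.
    inputs = np.random.randn(num_samples, dim) + 5.0 * targets[:, np.newaxis]
    return inputs, targets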
Example 5
def main(unused_argv):
    assert file_utils.Glob(FLAGS.train_glob), FLAGS.train_glob
    assert file_utils.Glob(FLAGS.eval_glob), FLAGS.eval_glob
    assert file_utils.Glob(FLAGS.test_glob), FLAGS.test_glob

    # Create output directory if it doesn't already exist.
    outdir = os.path.dirname(FLAGS.output_file)
    file_utils.MaybeMakeDirs(outdir)

    # Enumerate the configurations we want to run.
    exp_params = []
    model_names = models.get_sklearn_models().keys()
    for elem in itertools.product(*[FLAGS.embedding_list, model_names]):
        exp_params.append({
            'embedding_name': elem[0],
            'model_name': elem[1],
            'label_name': FLAGS.label_name,
            'label_list': FLAGS.label_list,
            'train_glob': FLAGS.train_glob,
            'eval_glob': FLAGS.eval_glob,
            'test_glob': FLAGS.test_glob,
            # Either L2 normalization or speaker normalization. You could try both
            # if you wanted.
            'l2_normalization': FLAGS.speaker_id_name is None,
            'speaker_id_name': FLAGS.speaker_id_name,
        })

    # Make and run beam pipeline.
    p = beam.Pipeline()
    _ = (p
         | 'MakeCollection' >> beam.Create(exp_params)
         | 'CalcScores' >> beam.Map(
             lambda d: (d, train_and_eval_sklearn.train_and_get_score(**d)))
         | 'FormatText' >> beam.Map(format_text_line)
         |
         'WriteOutput' >> beam.io.WriteToText(FLAGS.output_file, num_shards=1))
    result = p.run()
    result.wait_until_finish()
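
All of the `main` functions above read their configuration from absl flags. The flag definitions themselves sit outside these snippets; a representative subset, with names mirroring what the code references and placeholder help strings, would look roughly like:

from absl import flags

flags.DEFINE_string('train_glob', None, 'Glob for training tf.Examples.')
flags.DEFINE_string('eval_glob', None, 'Glob for eval tf.Examples.')
flags.DEFINE_string('test_glob', None, 'Glob for test tf.Examples.')
flags.DEFINE_string('output_file', None, 'Output filename for results.')
flags.DEFINE_list('embedding_list', None, 'List of embedding names.')
flags.DEFINE_string('label_name', None, 'Name of the label field.')
flags.DEFINE_list('label_list', None, 'All possible label values.')
flags.DEFINE_string('speaker_id_name', None, 'Optional speaker ID field.')
flags.DEFINE_string('save_model_dir', None, 'Optional directory for saved models.')

FLAGS = flags.FLAGS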
Example 6
def train_and_get_score(embedding_name,
                        label_name,
                        label_list,
                        train_glob,
                        eval_glob,
                        test_glob,
                        model_name,
                        l2_normalization,
                        speaker_id_name=None,
                        save_model_dir=None):
    """Train and eval sklearn models on data.

  Args:
    embedding_name: Name of embedding.
    label_name: Name of label to use.
    label_list: Python list of all values for label.
    train_glob: Location of training data, as tf.Examples.
    eval_glob: Location of eval data, as tf.Examples.
    test_glob: Location of test data, as tf.Examples.
    model_name: Name of model.
    l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
    speaker_id_name: `None`, or name of speaker ID field.
    save_model_dir: If not `None`, write sklearn models to this directory.

  Returns:
    A Python float, of the accuracy on the eval set.
  """
    def _cur_s(s):
        return time.time() - s

    def _cur_m(s):
        return (time.time() - s) / 60.0

    # Read and validate data.
    def _read_glob(glob, name):
        s = time.time()
        npx, npy = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                                   label_name, label_list,
                                                   l2_normalization,
                                                   speaker_id_name)
        logging.info('Finished reading %s data: %.2f sec.', name, _cur_s(s))
        return npx, npy

    npx_train, npy_train = _read_glob(train_glob, 'train')
    npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
    npx_test, npy_test = _read_glob(test_glob, 'test')

    # Sanity check npx_*.
    assert npx_train.size > 0
    assert npx_eval.size > 0
    assert npx_test.size > 0

    # Sanity check npy_train.
    assert npy_train.size > 0
    assert np.unique(npy_train).size > 1
    # Sanity check npy_eval.
    assert npy_eval.size > 0
    assert np.unique(npy_eval).size > 1
    # Sanity check npy_test.
    assert npy_test.size > 0
    assert np.unique(npy_test).size > 1

    # Train models.
    d = models.get_sklearn_models()[model_name]()
    logging.info('Made model.')

    s = time.time()
    d.fit(npx_train, npy_train)
    logging.info('Trained model: %.2f min', _cur_m(s))

    # Eval.
    eval_score = d.score(npx_eval, npy_eval)
    logging.info('%s: %.3f', model_name, eval_score)

    # Test.
    test_score = d.score(npx_test, npy_test)
    logging.info('%s: %.3f', model_name, test_score)

    # If `save_model_dir` is present, write model to this directory.
    # To load the model after saving, use:
    # ```python
    # with file_utils.Open(model_filename, 'rb') as f:
    #   m = pickle.load(f)
    # ```
    if save_model_dir:
        file_utils.MaybeMakeDirs(save_model_dir)
        model_filename = os.path.join(save_model_dir, f'{model_name}.pickle')
        with file_utils.Open(model_filename, 'wb') as f:
            pickle.dump(d, f)

    return (eval_score, test_score)
def train_and_get_score(
    embedding_name,
    label_name,
    label_list,
    train_glob,
    eval_glob,
    test_glob,
    model_name,
    l2_normalization,
    speaker_id_name=None,
    save_model_dir=None,
    save_predictions_dir=None,
    eval_metrics=('accuracy',)
):
  """Train and eval sklearn models on data.

  Args:
    embedding_name: Name of embedding.
    label_name: Name of label to use.
    label_list: Python list of all values for label.
    train_glob: Location of training data, as tf.Examples.
    eval_glob: Location of eval data, as tf.Examples.
    test_glob: Location of test data, as tf.Examples.
    model_name: Name of model.
    l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
    speaker_id_name: `None`, or name of speaker ID field.
    save_model_dir: If not `None`, write sklearn models to this directory.
    save_predictions_dir: If not `None`, write numpy array of predictions on
      train, eval, and test into this directory.
    eval_metrics: Iterable of string names of the desired evaluation metrics.

  Returns:
    A dict: {metric name: (eval metric, test metric)}
  """
  def _cur_s(s):
    return time.time() - s
  def _cur_m(s):
    return (time.time() - s) / 60.0

  # Read and validate data.
  def _read_glob(glob, name):
    logging.info('Starting to read %s: %s', name, glob)
    s = time.time()
    npx, npy, _ = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                                  label_name, label_list,
                                                  l2_normalization,
                                                  speaker_id_name)
    logging.info('Finished reading %s %s data: %.2f sec.', embedding_name, name,
                 _cur_s(s))
    return npx, npy
  npx_train, npy_train = _read_glob(train_glob, 'train')
  npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
  npx_test, npy_test = _read_glob(test_glob, 'test')

  # Sanity check npx_*.
  assert npx_train.size > 0
  assert npx_eval.size > 0
  assert npx_test.size > 0

  # Sanity check npy_train.
  assert npy_train.size > 0
  assert np.unique(npy_train).size > 1
  # Sanity check npy_eval.
  assert npy_eval.size > 0
  assert np.unique(npy_eval).size > 1
  # Sanity check npy_test.
  assert npy_test.size > 0
  assert np.unique(npy_test).size > 1

  # If `save_model_dir` is present and the model exists, load the model instead
  # of training.
  if save_model_dir:
    cur_models_dir = os.path.join(save_model_dir, embedding_name)
    tf.io.gfile.makedirs(cur_models_dir)
    model_filename = os.path.join(cur_models_dir, f'{model_name}.pickle')
    train_model = not tf.io.gfile.exists(model_filename)
  else:
    train_model = True

  # Train models.
  if train_model:
    d = models.get_sklearn_models()[model_name]()
    logging.info('Made model: %s.', model_name)
    s = time.time()
    d.fit(npx_train, npy_train)
    logging.info('Trained model: %s, %s: %.2f min', model_name, embedding_name,
                 _cur_m(s))
    # If `save_model_dir` is present and the model exists, write model to this
    # directory.
    if save_model_dir:
      with tf.io.gfile.GFile(model_filename, 'wb') as f:
        pickle.dump(d, f)
  else:  # Load model.
    with tf.io.gfile.GFile(model_filename, 'rb') as f:
      d = pickle.load(f)

  scores = {}
  for eval_metric in eval_metrics:
    eval_score, test_score = _calc_scores(eval_metric, d, npx_eval, npy_eval,
                                          npx_test, npy_test, label_list)
    logging.info('Finished eval: %s: %.3f', model_name, eval_score)
    logging.info('Finished test: %s: %.3f', model_name, test_score)
    scores[eval_metric] = (eval_score, test_score)

  if save_predictions_dir:
    cur_preds_dir = os.path.join(save_predictions_dir, embedding_name)
    tf.io.gfile.makedirs(cur_preds_dir)
    for dat_name, dat_x, dat_y in [('train', npx_train, npy_train),
                                   ('eval', npx_eval, npy_eval),
                                   ('test', npx_test, npy_test)]:
      pred_filename = os.path.join(cur_preds_dir,
                                   f'{model_name}_{dat_name}_pred.npz')
      pred_y = d.predict(dat_x)
      with tf.io.gfile.GFile(pred_filename, 'wb') as f:
        np.save(f, pred_y)
      y_filename = os.path.join(cur_preds_dir,
                                f'{model_name}_{dat_name}_y.npz')
      with tf.io.gfile.GFile(y_filename, 'wb') as f:
        np.save(f, dat_y)

  return scores
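
`_calc_scores` is not included in this snippet. A minimal stand-in that handles only accuracy-style metrics (the metric names here are an assumption; the real helper presumably supports more) could be:

import sklearn.metrics


def _calc_scores(eval_metric, model, npx_eval, npy_eval, npx_test, npy_test,
                 label_list):
  """Sketch: compute a single metric on the eval and test splits."""
  del label_list  # Not needed for these simple metrics.
  scorers = {
      'accuracy': sklearn.metrics.accuracy_score,
      'balanced_accuracy': sklearn.metrics.balanced_accuracy_score,
  }
  if eval_metric not in scorers:
    raise ValueError(f'Unsupported eval metric in this sketch: {eval_metric}')
  scorer = scorers[eval_metric]
  eval_score = scorer(npy_eval, model.predict(npx_eval))
  test_score = scorer(npy_test, model.predict(npx_test))
  return eval_score, test_score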
Example 8
def train_and_get_score(embedding_name,
                        label_name,
                        label_list,
                        train_glob,
                        eval_glob,
                        test_glob,
                        model_name,
                        l2_normalization,
                        speaker_id_name=None,
                        save_model_dir=None,
                        save_predictions_dir=None,
                        eval_metric='accuracy'):
    """Train and eval sklearn models on data.

  Args:
    embedding_name: Name of embedding.
    label_name: Name of label to use.
    label_list: Python list of all values for label.
    train_glob: Location of training data, as tf.Examples.
    eval_glob: Location of eval data, as tf.Examples.
    test_glob: Location of test data, as tf.Examples.
    model_name: Name of model.
    l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
    speaker_id_name: `None`, or name of speaker ID field.
    save_model_dir: If not `None`, write sklearn models to this directory.
    save_predictions_dir: If not `None`, write numpy array of predictions on
      train, eval, and test into this directory.
    eval_metric: String name of the desired evaluation metric.

  Returns:
    A tuple of Python floats, (eval metric, test metric).
  """
    def _cur_s(s):
        return time.time() - s

    def _cur_m(s):
        return (time.time() - s) / 60.0

    # Read and validate data.
    def _read_glob(glob, name):
        s = time.time()
        npx, npy = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                                   label_name, label_list,
                                                   l2_normalization,
                                                   speaker_id_name)
        logging.info('Finished reading %s %s data: %.2f sec.', embedding_name,
                     name, _cur_s(s))
        return npx, npy

    npx_train, npy_train = _read_glob(train_glob, 'train')
    npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
    npx_test, npy_test = _read_glob(test_glob, 'test')

    # Sanity check npx_*.
    assert npx_train.size > 0
    assert npx_eval.size > 0
    assert npx_test.size > 0

    # Sanity check npy_train.
    assert npy_train.size > 0
    assert np.unique(npy_train).size > 1
    # Sanity check npy_eval.
    assert npy_eval.size > 0
    assert np.unique(npy_eval).size > 1
    # Sanity check npy_test.
    assert npy_test.size > 0
    assert np.unique(npy_test).size > 1

    # Train models.
    d = models.get_sklearn_models()[model_name]()
    logging.info('Made model: %s.', model_name)

    s = time.time()
    d.fit(npx_train, npy_train)
    logging.info('Trained model: %s, %s: %.2f min', model_name, embedding_name,
                 _cur_m(s))

    eval_score, test_score = _calc_eval_scores(eval_metric, d, npx_eval,
                                               npy_eval, npx_test, npy_test)
    logging.info('Finished eval: %s: %.3f', model_name, eval_score)
    logging.info('Finished test: %s: %.3f', model_name, test_score)

    # If `save_model_dir` is present, write model to this directory.
    # To load the model after saving, use:
    # ```python
    # with file_utils.Open(model_filename, 'rb') as f:
    #   m = pickle.load(f)
    # ```
    if save_model_dir:
        cur_models_dir = os.path.join(save_model_dir, embedding_name)
        file_utils.MaybeMakeDirs(cur_models_dir)
        model_filename = os.path.join(cur_models_dir, f'{model_name}.pickle')
        with file_utils.Open(model_filename, 'wb') as f:
            pickle.dump(d, f)

    if save_predictions_dir:
        cur_preds_dir = os.path.join(save_predictions_dir, embedding_name)
        file_utils.MaybeMakeDirs(cur_preds_dir)
        for dat_name, dat_x, dat_y in [('train', npx_train, npy_train),
                                       ('eval', npx_eval, npy_eval),
                                       ('test', npx_test, npy_test)]:
            pred_filename = os.path.join(cur_preds_dir,
                                         f'{model_name}_{dat_name}_pred.npz')
            pred_y = d.predict(dat_x)
            with file_utils.Open(pred_filename, 'wb') as f:
                np.save(f, pred_y)
            y_filename = os.path.join(cur_preds_dir,
                                      f'{model_name}_{dat_name}_y.npz')
            with file_utils.Open(y_filename, 'wb') as f:
                np.save(f, dat_y)

    return (eval_score, test_score)
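
To read the saved predictions back later, loading mirrors the saving above; the arrays are written with `np.save`, so `np.load` is the counterpart (using the same `file_utils.Open` wrapper):

with file_utils.Open(pred_filename, 'rb') as f:
    pred_y = np.load(f)
with file_utils.Open(y_filename, 'rb') as f:
    true_y = np.load(f)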
Example 9
def train_and_get_score(embedding_name,
                        label_name,
                        label_list,
                        train_glob,
                        eval_glob,
                        test_glob,
                        model_name,
                        l2_normalization,
                        speaker_id_name=None):
    """Train and eval sklearn models on data.

  Args:
    embedding_name: Name of embedding.
    label_name: Name of label to use.
    label_list: Python list of all values for label.
    train_glob: Location of training data, as tf.Examples.
    eval_glob: Location of eval data, as tf.Examples.
    test_glob: Location of test data, as tf.Examples.
    model_name: Name of model.
    l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
    speaker_id_name: `None`, or name of speaker ID field.

  Returns:
    A Python float, of the accuracy on the eval set.
  """
    def _cur_s(s):
        return time.time() - s

    def _cur_m(s):
        return (time.time() - s) / 60.0

    # Read and validate data.
    def _read_glob(glob, name):
        s = time.time()
        npx, npy = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                                   label_name, label_list,
                                                   l2_normalization,
                                                   speaker_id_name)
        logging.info('Finished reading %s data: %.2f sec.', name, _cur_s(s))
        return npx, npy

    npx_train, npy_train = _read_glob(train_glob, 'train')
    npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
    npx_test, npy_test = _read_glob(test_glob, 'test')

    # Sanity check npx_*.
    assert npx_train.size > 0
    assert npx_eval.size > 0
    assert npx_test.size > 0

    # Sanity check npy_train.
    assert npy_train.size > 0
    assert np.unique(npy_train).size > 1
    # Sanity check npy_eval.
    assert npy_eval.size > 0
    assert np.unique(npy_eval).size > 1
    # Sanity check npy_test.
    assert npy_test.size > 0
    assert np.unique(npy_test).size > 1

    # Train models.
    d = models.get_sklearn_models()[model_name]()
    logging.info('Made model.')

    s = time.time()
    d.fit(npx_train, npy_train)
    logging.info('Trained model: %.2f min', _cur_m(s))

    # Eval.
    eval_score = d.score(npx_eval, npy_eval)
    logging.info('%s: %.3f', model_name, eval_score)

    # Test.
    test_score = d.score(npx_test, npy_test)
    logging.info('%s: %.3f', model_name, test_score)

    return (eval_score, test_score)
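
As a usage sketch for the function above (paths, label values, and the model key are placeholders; valid model names come from `models.get_sklearn_models()`):

eval_acc, test_acc = train_and_get_score(
    embedding_name='my_embedding',
    label_name='label',
    label_list=['positive', 'negative'],
    train_glob='/path/to/train*.tfrecord',
    eval_glob='/path/to/eval*.tfrecord',
    test_glob='/path/to/test*.tfrecord',
    model_name='LogisticRegression',
    l2_normalization=True)
print(f'Eval accuracy: {eval_acc:.3f}, test accuracy: {test_acc:.3f}')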