Example #1
def read_split_file(filename):
    """Read data splits written by write_split_file().

    Args:
      filename: The filename to read from.

    Returns:
      Dictionary {split_name: set(ids)}
    """
    with file_utils.Open(filename, 'r') as f:
        data_splits = json.load(f)
    return {k: set(v) for k, v in data_splits.items()}
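
A short usage sketch; the path is a hypothetical placeholder, and `file_utils.Open` is assumed to be a project-internal wrapper with the same interface as the built-in `open`:

# Hypothetical usage: load previously written splits.
splits = read_split_file('/tmp/splits.json')
train_ids = splits['train']  # each value is a Python set of IDs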
Example #2
def write_split_file(filename, data_splits):
    """Write data splits to a file in json format.

    Args:
      filename: The filename to write to.
      data_splits: {split_name: set(ids)}

    Returns:
      Nothing. Output is written to disk.
    """
    data_splits = {k: list(v) for k, v in data_splits.items()}
    with file_utils.Open(filename, 'w') as f:
        json.dump(data_splits, f)
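
A round-trip sketch pairing this with read_split_file() from Example #1; sets are serialized as JSON lists and rebuilt as sets on read (the path and IDs are hypothetical placeholders):

# Hypothetical round trip: write splits, read them back, and compare.
splits = {'train': {'id1', 'id2'}, 'test': {'id3'}}
write_split_file('/tmp/splits.json', splits)
assert read_split_file('/tmp/splits.json') == splits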
Example #3
def main(unused_argv):
  trill_layer = hub.KerasLayer(
      handle=FLAGS.trill_location,
      trainable=False,
      arguments={'sample_rate': 16000},
      output_key='embedding',
      output_shape=[None, 2048]
  )
  with file_utils.Open(FLAGS.sklearn_location, 'rb') as f:
    sklearn_model = pickle.load(f)
  combined_model = combine_models(trill_layer, sklearn_model)

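  # Verify the fused model agrees with the two-stage (TRILL -> sklearn)
  # pipeline on randomly generated inputs.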
  for seed in range(20):
    test_models_equal(trill_layer, sklearn_model, combined_model, seed, runs=10)

  tf.keras.models.save_model(combined_model, FLAGS.output_filepath)
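
combine_models() is defined elsewhere in the module and not shown here. For orientation, a hedged sketch of one common way to fuse a frozen embedding layer with a trained linear sklearn model (e.g. logistic regression): copy its `coef_`/`intercept_` into a Keras `Dense` layer. The input shape and mean-pooling step are assumptions, not the source's confirmed implementation.

import tensorflow as tf

def combine_models_sketch(embedding_layer, sklearn_model):
  """Fuses a frozen embedding layer with a trained linear sklearn model."""
  # Raw audio input: [batch, num_samples].
  model_input = tf.keras.Input(shape=(None,), dtype=tf.float32)
  # TRILL-style embeddings: [batch, time, 2048].
  embeddings = embedding_layer(model_input)
  # Average over time to get one embedding per clip.
  pooled = tf.reduce_mean(embeddings, axis=1)
  # sklearn stores coef_ as [n_classes, n_features]; the Dense kernel is
  # its transpose, [n_features, n_classes].
  logits = tf.keras.layers.Dense(
      sklearn_model.coef_.shape[0],
      kernel_initializer=tf.keras.initializers.Constant(sklearn_model.coef_.T),
      bias_initializer=tf.keras.initializers.Constant(
          sklearn_model.intercept_))(pooled)
  return tf.keras.Model(inputs=model_input, outputs=logits)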
Example #4
def train_and_get_score(embedding_name,
                        label_name,
                        label_list,
                        train_glob,
                        eval_glob,
                        test_glob,
                        model_name,
                        l2_normalization,
                        speaker_id_name=None,
                        save_model_dir=None):
    """Train and eval sklearn models on data.

    Args:
      embedding_name: Name of embedding.
      label_name: Name of label to use.
      label_list: Python list of all values for label.
      train_glob: Location of training data, as tf.Examples.
      eval_glob: Location of eval data, as tf.Examples.
      test_glob: Location of test data, as tf.Examples.
      model_name: Name of model.
      l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
      speaker_id_name: `None`, or name of speaker ID field.
      save_model_dir: If not `None`, write sklearn models to this directory.

    Returns:
      A tuple of Python floats, (eval accuracy, test accuracy).
    """
    def _cur_s(s):
        return time.time() - s

    def _cur_m(s):
        return (time.time() - s) / 60.0

    # Read and validate data.
    def _read_glob(glob, name):
        s = time.time()
        npx, npy = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                                   label_name, label_list,
                                                   l2_normalization,
                                                   speaker_id_name)
        logging.info('Finished reading %s data: %.2f sec.', name, _cur_s(s))
        return npx, npy

    npx_train, npy_train = _read_glob(train_glob, 'train')
    npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
    npx_test, npy_test = _read_glob(test_glob, 'test')

    # Sanity check npx_*.
    assert npx_train.size > 0
    assert npx_eval.size > 0
    assert npx_test.size > 0

    # Sanity check npy_train.
    assert npy_train.size > 0
    assert np.unique(npy_train).size > 1
    # Sanity check npy_eval.
    assert npy_eval.size > 0
    assert np.unique(npy_eval).size > 1
    # Sanity check npy_test.
    assert npy_test.size > 0
    assert np.unique(npy_test).size > 1

    # Train models.
    d = models.get_sklearn_models()[model_name]()
    logging.info('Made model.')

    s = time.time()
    d.fit(npx_train, npy_train)
    logging.info('Trained model: %.2f min', _cur_m(s))

    # Eval.
    eval_score = d.score(npx_eval, npy_eval)
    logging.info('%s eval score: %.3f', model_name, eval_score)

    # Test.
    test_score = d.score(npx_test, npy_test)
    logging.info('%s test score: %.3f', model_name, test_score)

    # If `save_model_dir` is present, write model to this directory.
    # To load the model after saving, use:
    # ```python
    # with file_utils.Open(model_filename, 'rb') as f:
    #   m = pickle.load(f)
    # ```
    if save_model_dir:
        file_utils.MaybeMakeDirs(save_model_dir)
        model_filename = os.path.join(save_model_dir, f'{model_name}.pickle')
        with file_utils.Open(model_filename, 'wb') as f:
            pickle.dump(d, f)

    return (eval_score, test_score)
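
A hedged call sketch; every argument value below is a hypothetical placeholder, and model_name must be a key in models.get_sklearn_models():

eval_score, test_score = train_and_get_score(
    embedding_name='trill',
    label_name='label',
    label_list=['yes', 'no'],
    train_glob='/data/train-*.tfrecord',
    eval_glob='/data/eval-*.tfrecord',
    test_glob='/data/test-*.tfrecord',
    model_name='LogisticRegression',
    l2_normalization=True,
    save_model_dir='/models')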
Example #5
def train_and_get_score(embedding_name,
                        label_name,
                        label_list,
                        train_glob,
                        eval_glob,
                        test_glob,
                        model_name,
                        l2_normalization,
                        speaker_id_name=None,
                        save_model_dir=None,
                        save_predictions_dir=None,
                        eval_metric='accuracy'):
    """Train and eval sklearn models on data.

    Args:
      embedding_name: Name of embedding.
      label_name: Name of label to use.
      label_list: Python list of all values for label.
      train_glob: Location of training data, as tf.Examples.
      eval_glob: Location of eval data, as tf.Examples.
      test_glob: Location of test data, as tf.Examples.
      model_name: Name of model.
      l2_normalization: Python bool. If `True`, normalize embeddings by L2 norm.
      speaker_id_name: `None`, or name of speaker ID field.
      save_model_dir: If not `None`, write sklearn models to this directory.
      save_predictions_dir: If not `None`, write numpy array of predictions on
        train, eval, and test into this directory.
      eval_metric: String name of the desired evaluation metric.

    Returns:
      A tuple of Python floats, (eval metric, test metric).
    """
    def _cur_s(s):
        return time.time() - s

    def _cur_m(s):
        return (time.time() - s) / 60.0

    # Read and validate data.
    def _read_glob(glob, name):
        s = time.time()
        npx, npy = sklearn_utils.tfexamples_to_nps(glob, embedding_name,
                                                   label_name, label_list,
                                                   l2_normalization,
                                                   speaker_id_name)
        logging.info('Finished reading %s %s data: %.2f sec.', embedding_name,
                     name, _cur_s(s))
        return npx, npy

    npx_train, npy_train = _read_glob(train_glob, 'train')
    npx_eval, npy_eval = _read_glob(eval_glob, 'eval')
    npx_test, npy_test = _read_glob(test_glob, 'test')

    # Sanity check npx_*.
    assert npx_train.size > 0
    assert npx_eval.size > 0
    assert npx_test.size > 0

    # Sanity check npy_train.
    assert npy_train.size > 0
    assert np.unique(npy_train).size > 1
    # Sanity check npy_eval.
    assert npy_eval.size > 0
    assert np.unique(npy_eval).size > 1
    # Sanity check npy_test.
    assert npy_test.size > 0
    assert np.unique(npy_test).size > 1

    # Train models.
    d = models.get_sklearn_models()[model_name]()
    logging.info('Made model: %s.', model_name)

    s = time.time()
    d.fit(npx_train, npy_train)
    logging.info('Trained model: %s, %s: %.2f min', model_name, embedding_name,
                 _cur_m(s))

    eval_score, test_score = _calc_eval_scores(eval_metric, d, npx_eval,
                                               npy_eval, npx_test, npy_test)
    logging.info('Finished eval: %s: %.3f', model_name, eval_score)
    logging.info('Finished test: %s: %.3f', model_name, test_score)

    # If `save_model_dir` is present, write model to this directory.
    # To load the model after saving, use:
    # ```python
    # with file_utils.Open(model_filename, 'rb') as f:
    #   m = pickle.load(f)
    # ```
    if save_model_dir:
        cur_models_dir = os.path.join(save_model_dir, embedding_name)
        file_utils.MaybeMakeDirs(cur_models_dir)
        model_filename = os.path.join(cur_models_dir, f'{model_name}.pickle')
        with file_utils.Open(model_filename, 'wb') as f:
            pickle.dump(d, f)

    if save_predictions_dir:
        cur_preds_dir = os.path.join(save_predictions_dir, embedding_name)
        file_utils.MaybeMakeDirs(cur_preds_dir)
        for dat_name, dat_x, dat_y in [('train', npx_train, npy_train),
                                       ('eval', npx_eval, npy_eval),
                                       ('test', npx_test, npy_test)]:
            # `np.save` writes the single-array .npy format (np.savez would
            # be needed for .npz archives), so name the files accordingly.
            pred_filename = os.path.join(cur_preds_dir,
                                         f'{model_name}_{dat_name}_pred.npy')
            pred_y = d.predict(dat_x)
            with file_utils.Open(pred_filename, 'wb') as f:
                np.save(f, pred_y)
            y_filename = os.path.join(cur_preds_dir,
                                      f'{model_name}_{dat_name}_y.npy')
            with file_utils.Open(y_filename, 'wb') as f:
                np.save(f, dat_y)

    return (eval_score, test_score)
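
_calc_eval_scores() is referenced above but not shown. A hedged sketch of what such a metric dispatcher might look like, assuming 'accuracy' falls through to the estimator's built-in score() and other metrics come from sklearn.metrics; the metric names handled here are illustrative, not the source's confirmed set.

from sklearn import metrics

def _calc_eval_scores_sketch(eval_metric, model, npx_eval, npy_eval,
                             npx_test, npy_test):
    """Returns (eval score, test score) under the requested metric."""
    if eval_metric == 'accuracy':
        # A classifier's default score() is accuracy.
        return (model.score(npx_eval, npy_eval),
                model.score(npx_test, npy_test))
    if eval_metric == 'balanced_accuracy':
        return (metrics.balanced_accuracy_score(npy_eval,
                                                model.predict(npx_eval)),
                metrics.balanced_accuracy_score(npy_test,
                                                model.predict(npx_test)))
    raise ValueError(f'Unknown eval metric: {eval_metric}')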