def testDPrimeSanity(self):
    auc = metrics.calculate_auc([0, 0, 1, 1],
                                np.array([[0.1, 0.2, 0.8, 0.9],
                                          [0.9, 0.8, 0.2, 0.1]]).transpose())
    metrics.dprime_from_auc(auc)
    auc = metrics.calculate_auc([0, 0, 1, 1],
                                np.array([[0.9, 0.8, 0.2, 0.1],
                                          [0.1, 0.2, 0.8, 0.9]]).transpose())
    metrics.dprime_from_auc(auc)
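For context, d-prime is conventionally derived from AUC via the inverse normal CDF, d' = sqrt(2) * ppf(AUC). Below is a minimal sketch of that relationship, assuming metrics.dprime_from_auc follows this standard definition; dprime_sketch is an illustrative name, not the repo's code.

import numpy as np
from scipy.stats import norm
from sklearn.metrics import roc_auc_score

def dprime_sketch(labels, scores):
    # Standard relationship: d' = sqrt(2) * inverse-normal-CDF(AUC).
    auc = roc_auc_score(labels, scores)
    return np.sqrt(2) * norm.ppf(auc)

# AUC is 0.75 for these toy scores, giving a d' of roughly 0.95.
print(dprime_sketch([0, 0, 1, 1], [0.1, 0.6, 0.4, 0.9]))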
Example #2
def _calc_eval_scores(eval_metric, d, npx_eval, npy_eval, npx_test, npy_test):
    """Compute desired metric on eval and test."""
    if eval_metric == 'equal_error_rate':
        # Eval.
        regression_output = d.predict_proba(npx_eval)[:, 1]  # Prob of class 1.
        eval_score = metrics.calculate_eer(npy_eval, regression_output)
        # Test.
        regression_output = d.predict_proba(npx_test)[:, 1]  # Prob of class 1.
        test_score = metrics.calculate_eer(npy_test, regression_output)
    elif eval_metric == 'accuracy':
        # Eval.
        eval_score = d.score(npx_eval, npy_eval)
        # Test.
        test_score = d.score(npx_test, npy_test)
    elif eval_metric == 'balanced_accuracy':
        # Eval.
        pred_eval = d.predict(npx_eval)
        eval_score = metrics.balanced_accuracy(npy_eval, pred_eval)
        # Test.
        pred_test = d.predict(npx_test)
        test_score = metrics.balanced_accuracy(npy_test, pred_test)
    elif eval_metric == 'unweighted_average_recall':
        # Unweighted average recall: the mean of per-class recall (accuracy
        # computed on each class separately), regardless of how many
        # instances each class has.
        def _class_scores(npx, npy):
            class_scores = []
            for lbl in np.unique(npy):
                i = npy == lbl
                class_scores.append(d.score(npx[i], npy[i]))
            return class_scores

        eval_score = np.mean(_class_scores(npx_eval, npy_eval))
        test_score = np.mean(_class_scores(npx_test, npy_test))
    elif eval_metric == 'auc':
        # Eval.
        regression_output = d.predict_proba(npx_eval)[:, 1]  # Prob of class 1.
        eval_score = metrics.calculate_auc(labels=npy_eval,
                                           predictions=regression_output)
        # Test.
        regression_output = d.predict_proba(npx_test)[:, 1]  # Prob of class 1.
        test_score = metrics.calculate_auc(labels=npy_test,
                                           predictions=regression_output)
    else:
        raise ValueError(f'`eval_metric` not recognized: {eval_metric}')
    return eval_score, test_score
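A minimal usage sketch for the function above, assuming it is in scope and that d can be any fitted estimator exposing the sklearn predict/predict_proba/score API; the toy data is purely illustrative. The 'accuracy' branch only needs the estimator's own score method, while the other branches also rely on the repo's metrics module.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
npx_train, npy_train = rng.normal(size=(64, 8)), rng.integers(0, 2, size=64)
npx_eval, npy_eval = rng.normal(size=(32, 8)), rng.integers(0, 2, size=32)
npx_test, npy_test = rng.normal(size=(32, 8)), rng.integers(0, 2, size=32)

d = LogisticRegression().fit(npx_train, npy_train)
eval_score, test_score = _calc_eval_scores(
    'accuracy', d, npx_eval, npy_eval, npx_test, npy_test)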
Example #3
def testAUCSanity(self):
    metrics.calculate_auc([0, 0, 1, 1],
                          np.array([[0.1, 0.2, 0.8, 0.9],
                                    [0.9, 0.8, 0.2, 0.1]]).transpose())
    metrics.calculate_auc([0, 0, 1, 1],
                          np.array([[0.9, 0.8, 0.2, 0.1],
                                    [0.1, 0.2, 0.8, 0.9]]).transpose())
    metrics.calculate_auc([0, 1, 1, 2],
                          np.array([[0.1, 0.2, 0.7, 0.8],
                                    [0.5, 0.6, 0.1, 0.1],
                                    [0.4, 0.2, 0.2, 0.1]]).transpose(),
                          binary_classification=False)
    metrics.calculate_auc([0, 1, 1, 2],
                          np.array([[0.8, 0.7, 0.2, 0.1],
                                    [0.1, 0.1, 0.6, 0.5],
                                    [0.1, 0.2, 0.2, 0.4]]).transpose(),
                          binary_classification=False)
    metrics.calculate_auc([0, 1, 1, 2],
                          np.array([[0.1, 0.2, 0.7, 0.8],
                                    [0.5, 0.6, 0.1, 0.1],
                                    [0.4, 0.2, 0.2, 0.1]]).transpose(),
                          binary_classification=False,
                          multi_class='ovo')
    metrics.calculate_auc([0, 1, 1, 2],
                          np.array([[0.8, 0.7, 0.2, 0.1],
                                    [0.1, 0.1, 0.6, 0.5],
                                    [0.1, 0.2, 0.2, 0.4]]).transpose(),
                          binary_classification=False,
                          multi_class='ovo')
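A minimal sketch of the kind of wrapper these tests exercise, assuming metrics.calculate_auc is roughly a thin layer over sklearn's roc_auc_score; the real implementation may differ, and calculate_auc_sketch is only an illustrative name.

import numpy as np
from sklearn.metrics import roc_auc_score

def calculate_auc_sketch(labels, predictions,
                         binary_classification=True, multi_class='ovr'):
    predictions = np.asarray(predictions)
    if binary_classification:
        if predictions.ndim == 2:
            # Keep only the probability of the positive class.
            predictions = predictions[:, 1]
        return roc_auc_score(labels, predictions)
    # Multiclass: one probability column per class, averaged one-vs-rest
    # ('ovr') or one-vs-one ('ovo') according to multi_class.
    return roc_auc_score(labels, predictions, multi_class=multi_class)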
def eval_and_report():
  """Eval on voxceleb."""
  logging.info('embedding_name: %s', FLAGS.embedding_name)
  logging.info('Logdir: %s', FLAGS.logdir)
  logging.info('Batch size: %s', FLAGS.batch_size)

  writer = tf.summary.create_file_writer(FLAGS.eval_dir)
  num_classes = len(FLAGS.label_list)
  model = models.get_keras_model(
      num_classes, FLAGS.use_batch_normalization,
      num_clusters=FLAGS.num_clusters, alpha_init=FLAGS.alpha_init)
  checkpoint = tf.train.Checkpoint(model=model)

  for ckpt in tf.train.checkpoints_iterator(
      FLAGS.logdir, timeout=FLAGS.timeout):
    assert 'ckpt-' in ckpt, ckpt
    step = ckpt.split('ckpt-')[-1]
    logging.info('Starting to evaluate step: %s.', step)

    checkpoint.restore(ckpt)

    logging.info('Loaded weights for eval step: %s.', step)

    reader = tf.data.TFRecordDataset
    ds = get_data.get_data(
        file_pattern=FLAGS.file_pattern,
        reader=reader,
        embedding_name=FLAGS.embedding_name,
        embedding_dim=FLAGS.embedding_dimension,
        label_name=FLAGS.label_name,
        label_list=FLAGS.label_list,
        bucket_boundaries=FLAGS.bucket_boundaries,
        bucket_batch_sizes=[FLAGS.batch_size] * (len(FLAGS.bucket_boundaries) + 1),  # pylint:disable=line-too-long
        loop_forever=False,
        shuffle=False)
    logging.info('Got dataset for eval step: %s.', step)
    if FLAGS.take_fixed_data:
      ds = ds.take(FLAGS.take_fixed_data)

    acc_m = tf.keras.metrics.Accuracy()
    xent_m = tf.keras.metrics.CategoricalCrossentropy(from_logits=True)

    logging.info('Starting the ds loop...')
    count, ex_count = 0, 0
    all_logits, all_real = [], []
    s = time.time()
    for emb, y_onehot in ds:
      emb.shape.assert_has_rank(3)
      assert emb.shape[2] == FLAGS.embedding_dimension
      y_onehot.shape.assert_has_rank(2)
      assert y_onehot.shape[1] == len(FLAGS.label_list)

      logits = model(emb, training=False)
      all_logits.extend(logits.numpy()[:, 1])
      all_real.extend(y_onehot.numpy()[:, 1])
      acc_m.update_state(y_true=tf.argmax(y_onehot, 1),
                         y_pred=tf.argmax(logits, 1))
      xent_m.update_state(y_true=y_onehot, y_pred=logits)
      ex_count += logits.shape[0]
      count += 1
      logging.info('Saw %i examples after %i iterations in %.2f secs...',
                   ex_count, count,
                   time.time() - s)
    if FLAGS.calculate_equal_error_rate:
      eer_score = metrics.calculate_eer(all_real, all_logits)
    auc_score = metrics.calculate_auc(all_real, all_logits)
    dprime_score = metrics.dprime_from_auc(auc_score)
    with writer.as_default():
      tf.summary.scalar('accuracy', acc_m.result().numpy(), step=int(step))
      tf.summary.scalar('xent_loss', xent_m.result().numpy(), step=int(step))
      tf.summary.scalar('auc', auc_score, step=int(step))
      tf.summary.scalar('dprime', dprime_score, step=int(step))
      if FLAGS.calculate_equal_error_rate:
        tf.summary.scalar('eer', eer_score, step=int(step))
    logging.info('Done with eval step: %s in %.2f secs.', step, time.time() - s)
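The equal error rate logged above is conventionally the operating point where false accepts and false rejects are equal. A minimal sketch of that computation from sklearn's ROC curve, assuming metrics.calculate_eer follows the standard definition; eer_sketch is a hypothetical helper.

import numpy as np
from sklearn.metrics import roc_curve

def eer_sketch(labels, scores):
    fpr, tpr, _ = roc_curve(labels, scores)
    fnr = 1.0 - tpr
    # Take the threshold where FPR and FNR are closest and average the two.
    idx = np.nanargmin(np.abs(fnr - fpr))
    return (fpr[idx] + fnr[idx]) / 2.0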
def eval_and_report():
    """Eval on voxceleb."""
    logging.info('samples_key: %s', FLAGS.samples_key)
    logging.info('Logdir: %s', FLAGS.logdir)
    logging.info('Batch size: %s', FLAGS.batch_size)

    writer = tf.summary.create_file_writer(FLAGS.eval_dir)
    num_classes = len(FLAGS.label_list)
    model = models.get_keras_model(num_classes,
                                   FLAGS.ubn,
                                   num_clusters=FLAGS.nc,
                                   alpha_init=FLAGS.alpha_init)
    checkpoint = tf.train.Checkpoint(model=model)

    for ckpt in tf.train.checkpoints_iterator(FLAGS.logdir,
                                              timeout=FLAGS.timeout):
        assert 'ckpt-' in ckpt, ckpt
        step = ckpt.split('ckpt-')[-1]
        logging.info('Starting to evaluate step: %s.', step)

        checkpoint.restore(ckpt)

        logging.info('Loaded weights for eval step: %s.', step)

        reader = tf.data.TFRecordDataset
        ds = get_data.get_data(file_pattern=FLAGS.file_pattern,
                               reader=reader,
                               samples_key=FLAGS.samples_key,
                               min_length=FLAGS.min_length,
                               label_key=FLAGS.label_key,
                               label_list=FLAGS.label_list,
                               batch_size=FLAGS.batch_size,
                               loop_forever=False,
                               shuffle=False)
        logging.info('Got dataset for eval step: %s.', step)
        if FLAGS.take_fixed_data:
            ds = ds.take(FLAGS.take_fixed_data)

        acc_m = tf.keras.metrics.Accuracy()
        xent_m = tf.keras.metrics.CategoricalCrossentropy(from_logits=True)

        logging.info('Starting the ds loop...')
        count, ex_count = 0, 0
        s = time.time()
        all_logits, all_real = [], []
        for wav_samples, y_onehot in ds:
            wav_samples.shape.assert_is_compatible_with(
                [None, FLAGS.min_length])
            y_onehot.shape.assert_is_compatible_with(
                [None, len(FLAGS.label_list)])

            logits = model(wav_samples, training=False)
            all_logits.extend(logits.numpy()[:, 1])
            all_real.extend(y_onehot.numpy()[:, 1])
            acc_m.update_state(y_true=tf.argmax(y_onehot, 1),
                               y_pred=tf.argmax(logits, 1))
            xent_m.update_state(y_true=y_onehot, y_pred=logits)
            ex_count += logits.shape[0]
            count += 1
            logging.info('Saw %i examples after %i iterations in %.2f secs...',
                         ex_count, count,
                         time.time() - s)
        if FLAGS.calculate_equal_error_rate:
            eer_score = metrics.calculate_eer(all_real, all_logits)
        auc_score = metrics.calculate_auc(all_real, all_logits)
        dprime_score = metrics.dprime_from_auc(auc_score)
        with writer.as_default():
            tf.summary.scalar('accuracy',
                              acc_m.result().numpy(),
                              step=int(step))
            tf.summary.scalar('xent_loss',
                              xent_m.result().numpy(),
                              step=int(step))
            tf.summary.scalar('auc', auc_score, step=int(step))
            tf.summary.scalar('dprime', dprime_score, step=int(step))
            if FLAGS.calculate_equal_error_rate:
                tf.summary.scalar('eer', eer_score, step=int(step))
        logging.info('Done with eval step: %s in %.2f secs.', step,
                     time.time() - s)
Example #6
def testDPrimeSanity(self):
    auc = metrics.calculate_auc([0, 0, 1, 1], [0.1, 0.2, 0.8, 0.9])
    metrics.dprime_from_auc(auc)
    auc = metrics.calculate_auc([0, 0, 1, 1], [0.9, 0.8, 0.2, 0.1])
    metrics.dprime_from_auc(auc)
Example #7
def testAUCSanity(self):
    metrics.calculate_auc([0, 0, 1, 1], [0.1, 0.2, 0.8, 0.9])
    metrics.calculate_auc([0, 0, 1, 1], [0.9, 0.8, 0.2, 0.1])
def _calc_scores(eval_metric, d, npx_eval,
                 npy_eval, npx_test,
                 npy_test,
                 label_list):
  """Compute desired metric on eval and test."""
  if eval_metric == 'equal_error_rate':
    # Eval.
    regression_output = d.predict_proba(npx_eval)[:, 1]  # Prob of class 1.
    eval_score = metrics.calculate_eer(npy_eval, regression_output)
    # Test.
    regression_output = d.predict_proba(npx_test)[:, 1]  # Prob of class 1.
    test_score = metrics.calculate_eer(npy_test, regression_output)
  elif eval_metric == 'accuracy':
    # Eval.
    eval_score = d.score(npx_eval, npy_eval)
    # Test.
    test_score = d.score(npx_test, npy_test)
  elif eval_metric == 'balanced_accuracy':
    # Eval.
    pred_eval = d.predict(npx_eval)
    eval_score = metrics.balanced_accuracy(npy_eval, pred_eval)
    # Test.
    pred_test = d.predict(npx_test)
    test_score = metrics.balanced_accuracy(npy_test, pred_test)
  elif eval_metric == 'unweighted_average_recall':
    # Unweighted average recall: the mean of per-class recall (accuracy
    # computed on each class separately), regardless of how many instances
    # each class has.
    def _class_scores(npx, npy):
      class_scores = []
      for lbl in np.unique(npy):
        i = npy == lbl
        class_scores.append(d.score(npx[i], npy[i]))
      return class_scores
    eval_score = np.mean(_class_scores(npx_eval, npy_eval))
    test_score = np.mean(_class_scores(npx_test, npy_test))
  elif eval_metric == 'auc':
    binary_classification = (len(label_list) == 2)
    regression_eval = d.predict_proba(npx_eval)
    eval_score = metrics.calculate_auc(
        labels=npy_eval,
        predictions=regression_eval,
        binary_classification=binary_classification,
        multi_class='ovr')
    regression_test = d.predict_proba(npx_test)
    test_score = metrics.calculate_auc(
        labels=npy_test,
        predictions=regression_test,
        binary_classification=binary_classification,
        multi_class='ovr')
  elif eval_metric in ['dprime', 'dprime_ovo']:
    multi_class = 'ovo' if eval_metric == 'dprime_ovo' else 'ovr'
    binary_classification = (len(label_list) == 2)
    regression_eval = d.predict_proba(npx_eval)
    eval_auc = metrics.calculate_auc(
        labels=npy_eval,
        predictions=regression_eval,
        binary_classification=binary_classification,
        multi_class=multi_class)
    regression_test = d.predict_proba(npx_test)
    test_auc = metrics.calculate_auc(
        labels=npy_test,
        predictions=regression_test,
        binary_classification=binary_classification,
        multi_class=multi_class)
    eval_score = metrics.dprime_from_auc(eval_auc)
    test_score = metrics.dprime_from_auc(test_auc)
  elif eval_metric == 'f1_score':
    # Eval.
    pred_eval = d.predict(npx_eval)
    eval_score = metrics.f1_score(labels=npy_eval, predictions=pred_eval)
    # Test.
    pred_test = d.predict(npx_test)
    test_score = metrics.f1_score(labels=npy_test, predictions=pred_test)
  else:
    raise ValueError(f'`eval_metric` not recognized: {eval_metric}')
  return eval_score, test_score
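A minimal usage sketch for the 'unweighted_average_recall' branch above, assuming _calc_scores is in scope; that branch only uses the estimator's score method, and its result is the macro-averaged recall over the classes present in the labels. The three-class toy data and LogisticRegression are purely illustrative.

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score

rng = np.random.default_rng(0)
npx_train, npy_train = rng.normal(size=(90, 8)), rng.integers(0, 3, size=90)
npx_eval, npy_eval = rng.normal(size=(45, 8)), rng.integers(0, 3, size=45)
npx_test, npy_test = rng.normal(size=(45, 8)), rng.integers(0, 3, size=45)

d = LogisticRegression(max_iter=1000).fit(npx_train, npy_train)
eval_uar, test_uar = _calc_scores(
    'unweighted_average_recall', d, npx_eval, npy_eval, npx_test, npy_test,
    label_list=[0, 1, 2])
# The mean of per-class recall should match sklearn's macro-averaged recall
# computed on the eval predictions.
print(eval_uar, recall_score(npy_eval, d.predict(npx_eval), average='macro'))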