def testDPrimeSanity(self):
  auc = metrics.calculate_auc(
      [0, 0, 1, 1],
      np.array([[0.1, 0.2, 0.8, 0.9], [0.9, 0.8, 0.2, 0.1]]).transpose())
  metrics.dprime_from_auc(auc)
  auc = metrics.calculate_auc(
      [0, 0, 1, 1],
      np.array([[0.9, 0.8, 0.2, 0.1], [0.1, 0.2, 0.8, 0.9]]).transpose())
  metrics.dprime_from_auc(auc)

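# For reference, a minimal sketch of what `dprime_from_auc` plausibly
# computes: the standard signal-detection identity d' = sqrt(2) * Phi^{-1}(AUC),
# where Phi^{-1} is the inverse normal CDF. The actual `metrics` implementation
# is not shown in this section, so treat this as an assumption.
import numpy as np
from scipy import stats


def dprime_from_auc_sketch(auc):
  """d' from AUC via the inverse normal CDF: sqrt(2) * ppf(auc)."""
  return np.sqrt(2) * stats.norm.ppf(auc)


# Chance performance (AUC = 0.5) maps to d' = 0.
assert dprime_from_auc_sketch(0.5) == 0.0
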
def _calc_eval_scores(eval_metric, d, npx_eval, npy_eval, npx_test, npy_test):
  """Compute desired metric on eval and test."""
  if eval_metric == 'equal_error_rate':
    # Eval.
    regression_output = d.predict_proba(npx_eval)[:, 1]  # Prob of class 1.
    eval_score = metrics.calculate_eer(npy_eval, regression_output)
    # Test.
    regression_output = d.predict_proba(npx_test)[:, 1]  # Prob of class 1.
    test_score = metrics.calculate_eer(npy_test, regression_output)
  elif eval_metric == 'accuracy':
    # Eval.
    eval_score = d.score(npx_eval, npy_eval)
    # Test.
    test_score = d.score(npx_test, npy_test)
  elif eval_metric == 'balanced_accuracy':
    # Eval.
    pred_eval = d.predict(npx_eval)
    eval_score = metrics.balanced_accuracy(npy_eval, pred_eval)
    # Test.
    pred_test = d.predict(npx_test)
    test_score = metrics.balanced_accuracy(npy_test, pred_test)
  elif eval_metric == 'unweighted_average_recall':
    # Mean of the per-class accuracies (i.e. per-class recall), so every
    # class contributes equally regardless of how many instances it has.
    def _class_scores(npx, npy):
      class_scores = []
      for lbl in np.unique(npy):
        i = npy == lbl
        class_scores.append(d.score(npx[i], npy[i]))
      return class_scores
    eval_score = np.mean(_class_scores(npx_eval, npy_eval))
    test_score = np.mean(_class_scores(npx_test, npy_test))
  elif eval_metric == 'auc':
    # Eval.
    regression_output = d.predict_proba(npx_eval)[:, 1]  # Prob of class 1.
    eval_score = metrics.calculate_auc(
        labels=npy_eval, predictions=regression_output)
    # Test.
    regression_output = d.predict_proba(npx_test)[:, 1]  # Prob of class 1.
    test_score = metrics.calculate_auc(
        labels=npy_test, predictions=regression_output)
  else:
    raise ValueError(f'`eval_metric` not recognized: {eval_metric}')
  return eval_score, test_score

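# A minimal usage sketch for `_calc_eval_scores`, assuming `d` is any fitted
# sklearn-style classifier exposing `predict`, `predict_proba`, and `score`.
# The toy data and the `LogisticRegression` choice are illustrative only.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.RandomState(0)
npx = rng.randn(100, 8)
npy = (npx[:, 0] > 0).astype(int)
d = LogisticRegression().fit(npx[:60], npy[:60])
eval_score, test_score = _calc_eval_scores(
    'accuracy', d, npx[60:80], npy[60:80], npx[80:], npy[80:])
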
def testAUCSanity(self):
  metrics.calculate_auc(
      [0, 0, 1, 1],
      np.array([[0.1, 0.2, 0.8, 0.9], [0.9, 0.8, 0.2, 0.1]]).transpose())
  metrics.calculate_auc(
      [0, 0, 1, 1],
      np.array([[0.9, 0.8, 0.2, 0.1], [0.1, 0.2, 0.8, 0.9]]).transpose())
  metrics.calculate_auc(
      [0, 1, 1, 2],
      np.array([[0.1, 0.2, 0.7, 0.8], [0.5, 0.6, 0.1, 0.1],
                [0.4, 0.2, 0.2, 0.1]]).transpose(),
      binary_classification=False)
  metrics.calculate_auc(
      [0, 1, 1, 2],
      np.array([[0.8, 0.7, 0.2, 0.1], [0.1, 0.1, 0.6, 0.5],
                [0.1, 0.2, 0.2, 0.4]]).transpose(),
      binary_classification=False)
  metrics.calculate_auc(
      [0, 1, 1, 2],
      np.array([[0.1, 0.2, 0.7, 0.8], [0.5, 0.6, 0.1, 0.1],
                [0.4, 0.2, 0.2, 0.1]]).transpose(),
      binary_classification=False,
      multi_class='ovo')
  metrics.calculate_auc(
      [0, 1, 1, 2],
      np.array([[0.8, 0.7, 0.2, 0.1], [0.1, 0.1, 0.6, 0.5],
                [0.1, 0.2, 0.2, 0.4]]).transpose(),
      binary_classification=False,
      multi_class='ovo')

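# `calculate_auc` itself is not shown; a sketch under the assumption that it
# wraps `sklearn.metrics.roc_auc_score`, with `binary_classification=False`
# routing the full probability matrix through sklearn's `multi_class` modes
# ('ovr' one-vs-rest, 'ovo' one-vs-one). The handling of 2-D binary inputs
# here is likewise an assumption.
import numpy as np
from sklearn.metrics import roc_auc_score


def calculate_auc_sketch(labels, predictions,
                         binary_classification=True, multi_class='ovr'):
  """AUC via sklearn; for binary tasks, score the class-1 column."""
  predictions = np.asarray(predictions)
  if binary_classification:
    if predictions.ndim == 2:  # (num_examples, 2) class probabilities.
      predictions = predictions[:, 1]
    return roc_auc_score(labels, predictions)
  return roc_auc_score(labels, predictions, multi_class=multi_class)
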
def eval_and_report(): """Eval on voxceleb.""" logging.info('embedding_name: %s', FLAGS.embedding_name) logging.info('Logdir: %s', FLAGS.logdir) logging.info('Batch size: %s', FLAGS.batch_size) writer = tf.summary.create_file_writer(FLAGS.eval_dir) num_classes = len(FLAGS.label_list) model = models.get_keras_model( num_classes, FLAGS.use_batch_normalization, num_clusters=FLAGS.num_clusters, alpha_init=FLAGS.alpha_init) checkpoint = tf.train.Checkpoint(model=model) for ckpt in tf.train.checkpoints_iterator( FLAGS.logdir, timeout=FLAGS.timeout): assert 'ckpt-' in ckpt, ckpt step = ckpt.split('ckpt-')[-1] logging.info('Starting to evaluate step: %s.', step) checkpoint.restore(ckpt) logging.info('Loaded weights for eval step: %s.', step) reader = tf.data.TFRecordDataset ds = get_data.get_data( file_pattern=FLAGS.file_pattern, reader=reader, embedding_name=FLAGS.embedding_name, embedding_dim=FLAGS.embedding_dimension, label_name=FLAGS.label_name, label_list=FLAGS.label_list, bucket_boundaries=FLAGS.bucket_boundaries, bucket_batch_sizes=[FLAGS.batch_size] * (len(FLAGS.bucket_boundaries) + 1), # pylint:disable=line-too-long loop_forever=False, shuffle=False) logging.info('Got dataset for eval step: %s.', step) if FLAGS.take_fixed_data: ds = ds.take(FLAGS.take_fixed_data) acc_m = tf.keras.metrics.Accuracy() xent_m = tf.keras.metrics.CategoricalCrossentropy(from_logits=True) logging.info('Starting the ds loop...') count, ex_count = 0, 0 all_logits, all_real = [], [] s = time.time() for emb, y_onehot in ds: emb.shape.assert_has_rank(3) assert emb.shape[2] == FLAGS.embedding_dimension y_onehot.shape.assert_has_rank(2) assert y_onehot.shape[1] == len(FLAGS.label_list) logits = model(emb, training=False) all_logits.extend(logits.numpy()[:, 1]) all_real.extend(y_onehot.numpy()[:, 1]) acc_m.update_state(y_true=tf.argmax(y_onehot, 1), y_pred=tf.argmax(logits, 1)) xent_m.update_state(y_true=y_onehot, y_pred=logits) ex_count += logits.shape[0] count += 1 logging.info('Saw %i examples after %i iterations as %.2f secs...', ex_count, count, time.time() - s) if FLAGS.calculate_equal_error_rate: eer_score = metrics.calculate_eer(all_real, all_logits) auc_score = metrics.calculate_auc(all_real, all_logits) dprime_score = metrics.dprime_from_auc(auc_score) with writer.as_default(): tf.summary.scalar('accuracy', acc_m.result().numpy(), step=int(step)) tf.summary.scalar('xent_loss', xent_m.result().numpy(), step=int(step)) tf.summary.scalar('auc', auc_score, step=int(step)) tf.summary.scalar('dprime', dprime_score, step=int(step)) if FLAGS.calculate_equal_error_rate: tf.summary.scalar('eer', eer_score, step=int(step)) logging.info('Done with eval step: %s in %.2f secs.', step, time.time() - s)
def eval_and_report(): """Eval on voxceleb.""" tf.logging.info('samples_key: %s', FLAGS.samples_key) logging.info('Logdir: %s', FLAGS.logdir) logging.info('Batch size: %s', FLAGS.batch_size) writer = tf.summary.create_file_writer(FLAGS.eval_dir) num_classes = len(FLAGS.label_list) model = models.get_keras_model(num_classes, FLAGS.ubn, num_clusters=FLAGS.nc, alpha_init=FLAGS.alpha_init) checkpoint = tf.train.Checkpoint(model=model) for ckpt in tf.train.checkpoints_iterator(FLAGS.logdir, timeout=FLAGS.timeout): assert 'ckpt-' in ckpt, ckpt step = ckpt.split('ckpt-')[-1] logging.info('Starting to evaluate step: %s.', step) checkpoint.restore(ckpt) logging.info('Loaded weights for eval step: %s.', step) reader = tf.data.TFRecordDataset ds = get_data.get_data(file_pattern=FLAGS.file_pattern, reader=reader, samples_key=FLAGS.samples_key, min_length=FLAGS.min_length, label_key=FLAGS.label_key, label_list=FLAGS.label_list, batch_size=FLAGS.batch_size, loop_forever=False, shuffle=False) logging.info('Got dataset for eval step: %s.', step) if FLAGS.take_fixed_data: ds = ds.take(FLAGS.take_fixed_data) acc_m = tf.keras.metrics.Accuracy() xent_m = tf.keras.metrics.CategoricalCrossentropy(from_logits=True) logging.info('Starting the ds loop...') count, ex_count = 0, 0 s = time.time() all_logits, all_real = [], [] for wav_samples, y_onehot in ds: wav_samples.shape.assert_is_compatible_with( [None, FLAGS.min_length]) y_onehot.shape.assert_is_compatible_with( [None, len(FLAGS.label_list)]) logits = model(wav_samples, training=False) all_logits.extend(logits.numpy()[:, 1]) all_real.extend(y_onehot.numpy()[:, 1]) acc_m.update_state(y_true=tf.argmax(y_onehot, 1), y_pred=tf.argmax(logits, 1)) xent_m.update_state(y_true=y_onehot, y_pred=logits) ex_count += logits.shape[0] count += 1 logging.info('Saw %i examples after %i iterations as %.2f secs...', ex_count, count, time.time() - s) if FLAGS.calculate_equal_error_rate: eer_score = metrics.calculate_eer(all_real, all_logits) auc_score = metrics.calculate_auc(all_real, all_logits) dprime_score = metrics.dprime_from_auc(auc_score) with writer.as_default(): tf.summary.scalar('accuracy', acc_m.result().numpy(), step=int(step)) tf.summary.scalar('xent_loss', xent_m.result().numpy(), step=int(step)) tf.summary.scalar('auc', auc_score, step=int(step)) tf.summary.scalar('dprime', dprime_score, step=int(step)) if FLAGS.calculate_equal_error_rate: tf.summary.scalar('eer', eer_score, step=int(step)) logging.info('Done with eval step: %s in %.2f secs.', step, time.time() - s)
def testDPrimeSanity(self):
  auc = metrics.calculate_auc([0, 0, 1, 1], [0.1, 0.2, 0.8, 0.9])
  metrics.dprime_from_auc(auc)
  auc = metrics.calculate_auc([0, 0, 1, 1], [0.9, 0.8, 0.2, 0.1])
  metrics.dprime_from_auc(auc)

def testAUCSanity(self):
  metrics.calculate_auc([0, 0, 1, 1], [0.1, 0.2, 0.8, 0.9])
  metrics.calculate_auc([0, 0, 1, 1], [0.9, 0.8, 0.2, 0.1])

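# For intuition: the first call ranks both positives above both negatives, so
# its AUC should be 1.0; the second reverses the ranking, giving 0.0. A quick
# check against sklearn, assuming `calculate_auc` agrees with `roc_auc_score`
# on binary inputs.
from sklearn.metrics import roc_auc_score

assert roc_auc_score([0, 0, 1, 1], [0.1, 0.2, 0.8, 0.9]) == 1.0
assert roc_auc_score([0, 0, 1, 1], [0.9, 0.8, 0.2, 0.1]) == 0.0
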
def _calc_scores(eval_metric, d, npx_eval, npy_eval, npx_test, npy_test,
                 label_list):
  """Compute desired metric on eval and test."""
  if eval_metric == 'equal_error_rate':
    # Eval.
    regression_output = d.predict_proba(npx_eval)[:, 1]  # Prob of class 1.
    eval_score = metrics.calculate_eer(npy_eval, regression_output)
    # Test.
    regression_output = d.predict_proba(npx_test)[:, 1]  # Prob of class 1.
    test_score = metrics.calculate_eer(npy_test, regression_output)
  elif eval_metric == 'accuracy':
    # Eval.
    eval_score = d.score(npx_eval, npy_eval)
    # Test.
    test_score = d.score(npx_test, npy_test)
  elif eval_metric == 'balanced_accuracy':
    # Eval.
    pred_eval = d.predict(npx_eval)
    eval_score = metrics.balanced_accuracy(npy_eval, pred_eval)
    # Test.
    pred_test = d.predict(npx_test)
    test_score = metrics.balanced_accuracy(npy_test, pred_test)
  elif eval_metric == 'unweighted_average_recall':
    # Mean of the per-class accuracies (i.e. per-class recall), so every
    # class contributes equally regardless of how many instances it has.
    def _class_scores(npx, npy):
      class_scores = []
      for lbl in np.unique(npy):
        i = npy == lbl
        class_scores.append(d.score(npx[i], npy[i]))
      return class_scores
    eval_score = np.mean(_class_scores(npx_eval, npy_eval))
    test_score = np.mean(_class_scores(npx_test, npy_test))
  elif eval_metric == 'auc':
    binary_classification = (len(label_list) == 2)
    regression_eval = d.predict_proba(npx_eval)
    eval_score = metrics.calculate_auc(
        labels=npy_eval,
        predictions=regression_eval,
        binary_classification=binary_classification,
        multi_class='ovr')
    regression_test = d.predict_proba(npx_test)
    test_score = metrics.calculate_auc(
        labels=npy_test,
        predictions=regression_test,
        binary_classification=binary_classification,
        multi_class='ovr')
  elif eval_metric in ['dprime', 'dprime_ovo']:
    multi_class = 'ovo' if eval_metric == 'dprime_ovo' else 'ovr'
    binary_classification = (len(label_list) == 2)
    regression_eval = d.predict_proba(npx_eval)
    eval_auc = metrics.calculate_auc(
        labels=npy_eval,
        predictions=regression_eval,
        binary_classification=binary_classification,
        multi_class=multi_class)
    regression_test = d.predict_proba(npx_test)
    test_auc = metrics.calculate_auc(
        labels=npy_test,
        predictions=regression_test,
        binary_classification=binary_classification,
        multi_class=multi_class)
    eval_score = metrics.dprime_from_auc(eval_auc)
    test_score = metrics.dprime_from_auc(test_auc)
  elif eval_metric == 'f1_score':
    # Eval.
    pred_eval = d.predict(npx_eval)
    eval_score = metrics.f1_score(labels=npy_eval, predictions=pred_eval)
    # Test.
    pred_test = d.predict(npx_test)
    test_score = metrics.f1_score(labels=npy_test, predictions=pred_test)
  else:
    raise ValueError(f'`eval_metric` not recognized: {eval_metric}')
  return eval_score, test_score

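# Note that `unweighted_average_recall` as computed above (mean per-class
# accuracy) coincides with balanced accuracy. A quick check with sklearn,
# assuming `metrics.balanced_accuracy` matches
# `sklearn.metrics.balanced_accuracy_score`.
import numpy as np
from sklearn.metrics import balanced_accuracy_score

labels = np.array([0, 0, 0, 1, 1])
preds = np.array([0, 0, 1, 1, 1])
# Per-class recalls: class 0 -> 2/3, class 1 -> 2/2.
per_class = [np.mean(preds[labels == c] == c) for c in np.unique(labels)]
assert np.isclose(np.mean(per_class), balanced_accuracy_score(labels, preds))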