# Relies on a mock.patch decorator (not shown in this excerpt) that injects
# `mock_dataframe` in place of the pd.DataFrame used by labeler_utils.
def test_save_conf_mat(self, mock_dataframe):
    # Ideally we would mock out the actual contents written to file, but
    # that would be difficult to get completely worked out.
    expected_conf_mat = np.array([
        [1, 0, 1],
        [1, 0, 0],
        [0, 1, 2],
    ])
    expected_row_col_names = dict(
        columns=['pred:PAD', 'pred:BACKGROUND', 'pred:OTHER'],
        index=['true:PAD', 'true:BACKGROUND', 'true:OTHER'],
    )
    # Calling the specced class mock yields the instance mock handed back
    # whenever the patched DataFrame constructor is invoked.
    mock_instance_df = mock.Mock(spec=pd.DataFrame)()
    mock_dataframe.return_value = mock_instance_df

    # PAD is still omitted here because the confusion matrix should include
    # all labels despite the omission.
    f1, f1_report = labeler_utils.evaluate_accuracy(
        self.y_pred,
        self.y_true,
        self.num_labels,
        self.reverse_label_mapping,
        omitted_labels=['PAD'],
        verbose=False,
        confusion_matrix_file='test.csv')

    self.assertTrue(
        (mock_dataframe.call_args[0][0] == expected_conf_mat).all())
    self.assertDictEqual(
        expected_row_col_names, mock_dataframe.call_args[1])
    mock_instance_df.to_csv.assert_called()
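# These tests depend on fixtures built in setUp, which this excerpt omits.
# Below is a hypothetical fixture (values are assumptions, not the original
# source) that is consistent with the expected confusion matrix above and
# with the per-class metrics asserted in the tests that follow; the real
# setUp may order the samples differently.
def setUp(self):
    self.num_labels = 3
    self.reverse_label_mapping = {0: 'PAD', 1: 'BACKGROUND', 2: 'OTHER'}
    # true:  PAD, PAD, BACKGROUND, OTHER, OTHER, OTHER
    self.y_true = np.array([0, 0, 1, 2, 2, 2])
    # pred:  PAD, OTHER, PAD, BACKGROUND, OTHER, OTHER
    self.y_pred = np.array([0, 2, 0, 1, 2, 2])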
def test_omit_2_classes(self):
    # No omitted_labels are passed, so the defaults apply; only OTHER
    # survives in the report, making every average equal its per-class
    # score.
    expected_output = {
        'OTHER': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
        'micro avg': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
        'macro avg': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
        'weighted avg': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
    }

    f1, f1_report = labeler_utils.evaluate_accuracy(
        self.y_pred,
        self.y_true,
        self.num_labels,
        self.reverse_label_mapping,
        verbose=False)

    self.assertEqual(2 / 3, f1)
    self.assertDictEqual(expected_output, f1_report)
def _validate_training(self, val_data, batch_size_test=32,
                       verbose_log=True, verbose_keras=False):
    """
    Validate the model on the test set and return the evaluation metrics.

    :param val_data: data generator for the validation
    :type val_data: iterator
    :param batch_size_test: number of samples to process in testing
    :type batch_size_test: int
    :param verbose_log: whether or not to print out scores for training,
        etc.
    :type verbose_log: bool
    :param verbose_keras: whether or not to print out scores for training
        from Keras
    :type verbose_keras: bool
    :return: (f1-score, f1 report)
    """
    f1 = None
    f1_report = None

    if val_data is None:
        return f1, f1_report

    # Predict on the validation set
    batch_id = 0
    y_val_pred = []
    y_val_test = []
    for x_val, y_val in val_data:
        y_val_pred.append(
            self._model.predict(
                x_val, batch_size=batch_size_test,
                verbose=verbose_keras)[1])
        y_val_test.append(np.argmax(y_val, axis=-1))
        batch_id += 1

        sys.stdout.flush()
        if verbose_log:
            sys.stdout.write("\rEPOCH %g, validation_batch_id %d" %
                             (self._epoch_id, batch_id))

    tf.keras.backend.set_floatx('float32')

    # Clean the predicted entities and the actual entities
    f1, f1_report = labeler_utils.evaluate_accuracy(
        np.concatenate(y_val_pred, axis=0),
        np.concatenate(y_val_test, axis=0),
        self.num_labels,
        self.reverse_label_mapping,
        verbose=verbose_keras)

    return f1, f1_report
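# A minimal sketch (not part of the original source) of a `val_data`
# generator compatible with the loop above. It assumes labels must arrive
# one-hot encoded, since the loop argmaxes y_val over the last axis, and
# that the `[1]` index on predict() means the model's second output carries
# the class predictions. `make_val_data` and its parameters are
# hypothetical names.
def make_val_data(x_val, y_val_int, num_labels, batch_size=32):
    """Yield (x, one_hot_y) batches from integer-encoded labels."""
    for start in range(0, len(x_val), batch_size):
        x_batch = x_val[start:start + batch_size]
        # One-hot encode so np.argmax(y_val, axis=-1) recovers the indices.
        y_batch = tf.keras.utils.to_categorical(
            y_val_int[start:start + batch_size], num_classes=num_labels)
        yield x_batch, y_batch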
def test_no_support_classes(self):
    # Classes present in the label mapping but absent from the data should
    # appear in the report with zero support, and should not drag down the
    # averages or the returned F1.
    expected_output = {
        'OTHER': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
        'NO_SUPPORT': {
            'precision': 0,
            'recall': 0,
            'f1-score': 0,
            'support': 0,
        },
        'NO_SUPPORT2': {
            'precision': 0,
            'recall': 0,
            'f1-score': 0,
            'support': 0,
        },
        'micro avg': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
        'macro avg': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
        'weighted avg': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
    }

    reverse_label_mapping = self.reverse_label_mapping.copy()
    reverse_label_mapping[3] = 'NO_SUPPORT'
    reverse_label_mapping[4] = 'NO_SUPPORT2'

    f1, f1_report = labeler_utils.evaluate_accuracy(
        self.y_pred,
        self.y_true,
        self.num_labels + 2,
        reverse_label_mapping,
        verbose=False)

    self.assertEqual(2 / 3, f1)
    self.assertDictEqual(expected_output, f1_report)
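# For contrast (not part of the original source): a plain
# sklearn.metrics.classification_report over the hypothetical fixture
# sketched earlier keeps zero-support classes in its macro average,
# yielding (2/3 + 0 + 0) / 3 == 2/9 rather than the 2/3 asserted above.
# The test's expected output therefore implies evaluate_accuracy drops
# no-support classes before averaging.
import math
from sklearn.metrics import classification_report

y_true = [0, 0, 1, 2, 2, 2]
y_pred = [0, 2, 0, 1, 2, 2]
report = classification_report(
    y_true, y_pred,
    labels=[2, 3, 4],  # OTHER plus the two unseen classes
    target_names=['OTHER', 'NO_SUPPORT', 'NO_SUPPORT2'],
    output_dict=True,
    zero_division=0)
assert math.isclose(report['OTHER']['f1-score'], 2 / 3)
assert math.isclose(report['macro avg']['f1-score'], 2 / 9)  # not 2/3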
# Relies on a mock.patch decorator (not shown in this excerpt) that
# captures sys.stdout, e.g. as a StringIO, so the printed report can be
# inspected via mock_stdout.getvalue().
def test_verbose(self, mock_stdout):
    f1, f1_report = labeler_utils.evaluate_accuracy(
        self.y_pred,
        self.y_true,
        self.num_labels,
        self.reverse_label_mapping,
        omitted_labels=[],
        verbose=True)

    self.assertIn('PAD', mock_stdout.getvalue())
    self.assertIn('BACKGROUND', mock_stdout.getvalue())
    self.assertIn('OTHER', mock_stdout.getvalue())
    self.assertIn('weighted avg', mock_stdout.getvalue())
    self.assertIn('accuracy', mock_stdout.getvalue())
    self.assertIn('macro avg', mock_stdout.getvalue())
    self.assertIn('support', mock_stdout.getvalue())
    self.assertIn('f1-score ', mock_stdout.getvalue())
    self.assertIn('F1 Score: ', mock_stdout.getvalue())
def test_no_omit_class(self):
    # With omitted_labels=[], every class appears in the report and the
    # macro average includes BACKGROUND's zero contribution (written as
    # an explicit + 0 term below).
    expected_output = {
        'PAD': {
            'precision': 1 / 2,
            'recall': 1 / 2,
            'f1-score': 1 / 2,
            'support': 2,
        },
        'BACKGROUND': {
            'precision': 0,
            'recall': 0,
            'f1-score': 0,
            'support': 1,
        },
        'OTHER': {
            'precision': 2 / 3,
            'recall': 2 / 3,
            'f1-score': 2 / 3,
            'support': 3,
        },
        'accuracy': 0.5,
        'macro avg': {
            'precision': (1 / 2 + 0 + 2 / 3) / 3,
            'recall': (1 / 2 + 0 + 2 / 3) / 3,
            'f1-score': (1 / 2 + 0 + 2 / 3) / 3,
            'support': 6,
        },
        'weighted avg': {
            'precision': 1 / 2,
            'recall': 1 / 2,
            'f1-score': 1 / 2,
            'support': 6,
        },
    }

    f1, f1_report = labeler_utils.evaluate_accuracy(
        self.y_pred,
        self.y_true,
        self.num_labels,
        self.reverse_label_mapping,
        omitted_labels=[],
        verbose=False)

    self.assertEqual((1 / 2 + 0 + 2 / 3) / 3, f1)
    self.assertDictEqual(expected_output, f1_report)
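# A quick arithmetic check (not part of the original source) of the
# weighted average asserted above, done exactly with fractions: each
# class's f1 is weighted by its support and divided by the total support.
from fractions import Fraction

supports = {'PAD': 2, 'BACKGROUND': 1, 'OTHER': 3}
f1s = {'PAD': Fraction(1, 2), 'BACKGROUND': Fraction(0),
       'OTHER': Fraction(2, 3)}
weighted = sum(f1s[k] * supports[k] for k in f1s) / sum(supports.values())
assert weighted == Fraction(1, 2)  # (2*(1/2) + 1*0 + 3*(2/3)) / 6 == 1/2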