Example #1
    def test(self, ts, steps=0, **kwargs):
        """Method that evaluates on some data.  There are 2 modes this can run in, `feed_dict` and `dataset`

        In `feed_dict` mode, the model cycles the test data batch-wise and feeds each batch in with a `feed_dict`.
        In `dataset` mode, the data is still passed in to this method, but it is not passed in a `feed_dict` and is
        mostly superfluous since the features are grafted right onto the graph.  However, we do use it for supplying
        the ground truth, ids and text, so it is essential that the caller does not shuffle the data
        :param ts: The test set
        :param conll_output: (`str`) An optional file output
        :param txts: A list of text data associated with the encoded batch
        :param dataset: (`bool`) Is this using `tf.dataset`s
        :return: The metrics
        """
        SET_TRAIN_FLAG(False)

        total_correct = total_sum = 0
        gold_spans = []
        pred_spans = []

        self.cm = ConfusionMatrix(self.idx2classlabel)

        handle = None
        if kwargs.get("conll_output") is not None and kwargs.get(
                'txts') is not None:
            handle = open(kwargs.get("conll_output"), "w")

        try:
            pg = create_progress_bar(steps)
            metrics = {}
            for (features, y), batch in pg(
                    zip_longest(ts, kwargs.get('batches', []), fillvalue={})):
                correct, count, golds, guesses = self.process_batch(
                    features,
                    y,
                    handle=handle,
                    txts=kwargs.get("txts"),
                    ids=batch.get("ids"))
                total_correct += correct
                total_sum += count
                gold_spans.extend(golds)
                pred_spans.extend(guesses)

            total_acc = total_correct / float(total_sum)
            # Span-level F1 over the collected gold and predicted chunks
            metrics['tagging_f1'] = span_f1(gold_spans, pred_spans)
            metrics['tagging_acc'] = total_acc
            metrics.update({
                f"classification_{k}": v
                for k, v in self.cm.get_all_metrics().items()
            })
            if self.verbose:
                conll_metrics = per_entity_f1(gold_spans, pred_spans)
                conll_metrics['acc'] = total_acc * 100
                conll_metrics['tokens'] = total_sum
                logger.info(conlleval_output(conll_metrics))
        finally:
            if handle is not None:
                handle.close()

        return metrics
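
A note on the loop above: `zip_longest(ts, kwargs.get('batches', []), fillvalue={})` means that when no `batches` are passed, each step still receives an empty dict, so `batch.get("ids")` simply comes back as `None`.  A minimal sketch of that behavior (the stand-in data below is illustrative only):

from itertools import zip_longest

ts = [("features_0", "y_0"), ("features_1", "y_1")]  # stand-in (features, y) pairs
for (features, y), batch in zip_longest(ts, [], fillvalue={}):
    # With no batches supplied, `batch` is always {} and ids comes back as None
    print(features, y, batch.get("ids"))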
Example #2
def test_binary_mcc():
    Y_TRUE = [1, 1, 1, 0]
    Y_PRED = [1, 0, 1, 1]
    MCC_GOLD = -0.3333333333333333
    cm = ConfusionMatrix([0, 1])
    cm.add_batch(Y_TRUE, Y_PRED)
    np.testing.assert_allclose(cm.get_mcc(), MCC_GOLD, TOL)
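
The expected value follows directly from the binary MCC formula.  With Y_TRUE = [1, 1, 1, 0] and Y_PRED = [1, 0, 1, 1] the counts are TP = 2, TN = 0, FP = 1, FN = 1, so:

    MCC = (TP*TN - FP*FN) / sqrt((TP+FP) * (TP+FN) * (TN+FP) * (TN+FN))
        = (2*0 - 1*1) / sqrt(3 * 3 * 1 * 1)
        = -1/3 ≈ -0.3333333333333333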
Example #3
    def _test(self, ts, **kwargs):

        self.model.eval()
        total_sum = 0
        total_correct = 0

        gold_spans = []
        pred_spans = []
        cm = ConfusionMatrix(self.idx2classlabel)
        metrics = {}
        steps = len(ts)
        conll_output = kwargs.get('conll_output', None)
        txts = kwargs.get('txts', None)
        handle = None
        if conll_output is not None and txts is not None:
            handle = open(conll_output, "w")
        pg = create_progress_bar(steps)
        for batch_dict in pg(ts):

            inputs = self.model.make_input(batch_dict)
            y = inputs.pop('y')
            lengths = inputs['lengths']
            ids = inputs['ids']
            class_labels = inputs["class_label"]
            with torch.no_grad():
                class_pred, pred = self.model(inputs)
            correct, count, golds, guesses = self.process_output(
                pred, y.data, lengths, ids, handle, txts)
            total_correct += correct
            total_sum += count
            gold_spans.extend(golds)
            pred_spans.extend(guesses)
            _add_to_cm(cm, class_labels, class_pred)

        total_acc = total_correct / float(total_sum)
        metrics['tagging_acc'] = total_acc
        metrics['tagging_f1'] = span_f1(gold_spans, pred_spans)
        metrics.update({
            f"classification_{k}": v
            for k, v in cm.get_all_metrics().items()
        })
        if self.verbose:
            # TODO: Add programmatic access to these metrics?
            conll_metrics = per_entity_f1(gold_spans, pred_spans)
            conll_metrics['acc'] = total_acc * 100
            conll_metrics['tokens'] = total_sum.item()
            logger.info(conlleval_output(conll_metrics))
        return metrics
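
`_add_to_cm` is referenced above (and in the PyTorch classifier trainers later on) but never shown.  A minimal sketch of what such a helper could look like, assuming `pred` is a batch of logits and `y` a tensor of gold class indices (both assumptions):

def _add_to_cm(cm, y, pred):
    # Reduce logits to one predicted class index per example, then
    # accumulate the (gold, guess) pairs into the confusion matrix.
    _, best = pred.max(1)
    cm.add_batch(y.cpu().numpy(), best.cpu().numpy())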
Example #4
    def _test(self, loader, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """
        if self.ema:
            self.sess.run(self.ema_load)

        use_dataset = kwargs.get('dataset', True)

        cm = ConfusionMatrix(self.model.labels)
        steps = len(loader)
        total_loss = 0
        total_norm = 0
        verbose = kwargs.get("verbose", None)

        pg = create_progress_bar(steps)
        for i, batch_dict in enumerate(pg(loader)):
            y = batch_dict['y']
            if use_dataset:
                guess, lossv = self.sess.run([self.model.best, self.test_loss])
            else:
                feed_dict = self.model.make_input(batch_dict, False)
                guess, lossv = self.sess.run([self.model.best, self.test_loss],
                                             feed_dict=feed_dict)

            batchsz = len(guess)
            total_loss += lossv * batchsz
            total_norm += batchsz
            cm.add_batch(y, guess)

        metrics = cm.get_all_metrics()
        metrics['avg_loss'] = total_loss / float(total_norm)
        verbose_output(verbose, cm)

        return metrics
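
A hedged usage sketch of the keyword arguments documented above; `trainer` and `valid_loader` are assumed to exist and are illustrative only:

metrics = trainer._test(
    valid_loader,
    dataset=False,                                   # feed batches in via a feed_dict
    reporting_fns=[],                                # no extra reporting hooks
    verbose={'console': True, 'file': 'confusion.txt'})
print(metrics['avg_loss'])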
Example #5
def test_mcc_inverse():
    golds = np.concatenate(
        (np.arange(2), np.random.randint(0, 2, size=np.random.randint(4, 100))),
        axis=0)
    preds = 1 - golds
    cm = ConfusionMatrix.create(golds, preds)
    np.testing.assert_allclose(cm.get_mcc(), -1.0, TOL)
Example #6
def test():
    C = np.random.randint(3, 11)
    golds = np.concatenate(
        (np.arange(C), np.random.randint(0, C, size=np.random.randint(4, 100))),
        axis=0)
    preds = np.random.randint(0, C, size=len(golds))
    cm = ConfusionMatrix.create(golds, preds)
    np.testing.assert_allclose(cm.get_r_k(), explicit_r_k(cm._cm), TOL)
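
`explicit_r_k` is the reference implementation this test compares against but is not shown here.  A sketch of what it plausibly computes, following Gorodkin's R_k (the multiclass generalization of MCC) directly from a confusion matrix; treating rows as gold labels and columns as predictions is an assumption:

import numpy as np

def explicit_r_k(cm):
    # Gorodkin's R_k over a KxK confusion matrix (rows = gold, cols = predicted).
    cm = np.asarray(cm, dtype=np.float64)
    s = cm.sum()        # total samples
    c = np.trace(cm)    # correctly classified samples
    t = cm.sum(axis=1)  # how often each class truly occurs
    p = cm.sum(axis=0)  # how often each class is predicted
    cov_ytyp = c * s - t.dot(p)
    cov_ypyp = s * s - p.dot(p)
    cov_ytyt = s * s - t.dot(t)
    denom = np.sqrt(cov_ypyp * cov_ytyt)
    return cov_ytyp / denom if denom else 0.0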
Example #7
def test_r_k_inverse():
    """The worst value for R k ranges from -1 to 0 depending on the distribution of the true labels."""
    C = np.random.randint(2, 11)
    golds = np.concatenate(
        (np.arange(C), np.random.randint(0, C, size=np.random.randint(4, 100))),
        axis=0)
    preds = (golds + 1) % C
    cm = ConfusionMatrix.create(golds, preds)
    assert -1.0 <= cm.get_r_k() <= 0.0
Example #8
def test_r_k_perfect():
    """Perfect correlation results in a score of 1."""
    C = np.random.randint(2, 11)
    golds = np.concatenate(
        (np.arange(C), np.random.randint(0, C, size=np.random.randint(4, 100))),
        axis=0)
    preds = np.copy(golds)
    cm = ConfusionMatrix.create(golds, preds)
    np.testing.assert_allclose(cm.get_r_k(), 1.0)
Example #9
def test():
    golds = np.concatenate(
        (np.arange(2), np.random.randint(0, 2, size=np.random.randint(4, 100))),
        axis=0)
    preds = np.random.randint(0, 2, size=len(golds))
    cm = ConfusionMatrix.create(golds, preds)
    np.testing.assert_allclose(cm.get_mcc(), explicit_mcc(golds, preds), TOL)
    np.testing.assert_allclose(cm.get_mcc(), author_mcc(cm._cm), TOL, TOL)
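
`explicit_mcc` and `author_mcc` are reference implementations this test checks against and are not included here.  A sketch of what an `explicit_mcc` over raw label arrays might look like, using the same binary MCC formula worked through under Example #2 (the helper name and signature match the test above, but the body is an assumption):

import numpy as np

def explicit_mcc(golds, preds):
    golds, preds = np.asarray(golds), np.asarray(preds)
    tp = np.sum((golds == 1) & (preds == 1))
    tn = np.sum((golds == 0) & (preds == 0))
    fp = np.sum((golds == 0) & (preds == 1))
    fn = np.sum((golds == 1) & (preds == 0))
    denom = np.sqrt(float((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))
    return (tp * tn - fp * fn) / denom if denom else 0.0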
Example #10
    def _test(self, loader, **kwargs):
        self.model.eval()
        total_loss = 0
        total_norm = 0
        steps = len(loader)
        pg = create_progress_bar(steps)
        cm = ConfusionMatrix(self.labels)
        verbose = kwargs.get("verbose", None)
        output = kwargs.get('output')
        txts = kwargs.get('txts')
        handle = None
        line_number = 0
        if output is not None and txts is not None:
            handle = open(output, "w")

        with torch.no_grad():
            for batch_dict in pg(loader):
                example = self._make_input(batch_dict)
                ys = example.pop('y')
                pred = self.model(example)
                loss = self.crit(pred, ys)
                if handle is not None:
                    for p, y in zip(pred, ys):
                        handle.write('{}\t{}\t{}\n'.format(
                            " ".join(txts[line_number]), self.model.labels[p],
                            self.model.labels[y]))
                        line_number += 1
                batchsz = self._get_batchsz(batch_dict)
                total_loss += loss.item() * batchsz
                total_norm += batchsz
                _add_to_cm(cm, ys, pred)

        metrics = cm.get_all_metrics()
        metrics['avg_loss'] = total_loss / float(total_norm)
        verbose_output(verbose, cm)
        if handle is not None:
            handle.close()

        return metrics
Example #11
    def _train(self, loader, **kwargs):
        self.model.train()
        reporting_fns = kwargs.get('reporting_fns', [])
        steps = len(loader)
        pg = create_progress_bar(steps)
        cm = ConfusionMatrix(self.labels)
        epoch_loss = 0
        epoch_div = 0
        for batch_dict in pg(loader):
            self.optimizer.zero_grad()
            example = self._make_input(batch_dict)
            y = example.pop('y')
            pred = self.model(example)
            loss = self.crit(pred, y)
            batchsz = self._get_batchsz(batch_dict)
            report_loss = loss.item() * batchsz
            epoch_loss += report_loss
            epoch_div += batchsz
            self.nstep_agg += report_loss
            self.nstep_div += batchsz
            loss.backward()
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.clip)
            _add_to_cm(cm, y, pred)
            self.optimizer.step()

            if (self.optimizer.global_step + 1) % self.nsteps == 0:
                metrics = self.calc_metrics(self.nstep_agg, self.nstep_div)
                metrics['lr'] = self.optimizer.current_lr
                self.report(self.optimizer.global_step + 1, metrics,
                            self.nstep_start, 'Train', 'STEP', reporting_fns,
                            self.nsteps)
                self.reset_nstep()

        metrics = cm.get_all_metrics()
        metrics['lr'] = self.optimizer.current_lr

        metrics['avg_loss'] = epoch_loss / float(epoch_div)
        return metrics
Example #12
    def _test(self, loader, steps=0, **kwargs):
        """Test an epoch of data using either the input loader or using `tf.dataset`

        In non-`tf.dataset` mode, we cycle the loader data feed, and pull a batch and feed it to the feed dict
        When we use `tf.dataset`s under the hood, this function simply uses the loader to know how many steps
        to train.

        :param loader: A data feed
        :param kwargs: See below

        :Keyword Arguments:
          * *dataset* (`bool`) Set to `True` if using `tf.dataset`s, defaults to `True`
          * *reporting_fns* (`list`) A list of reporting hooks to use
          * *verbose* (`dict`) A dictionary containing `console` boolean and `file` name if on

        :return: Metrics
        """

        cm = ConfusionMatrix(self.model.labels)
        total_loss = 0
        total_norm = 0
        verbose = kwargs.get("verbose", None)

        pg = create_progress_bar(steps)

        SET_TRAIN_FLAG(False)
        for features, y in pg(loader):
            logits = self.model(features)
            y_ = tf.argmax(logits, axis=1, output_type=tf.int32)
            lossv = tf.compat.v1.losses.sparse_softmax_cross_entropy(
                labels=y, logits=logits).numpy()
            batchsz = int(y.shape[0])
            assert len(y_) == batchsz
            total_loss += lossv * batchsz
            total_norm += batchsz
            cm.add_batch(y, y_)

        metrics = cm.get_all_metrics()
        metrics['avg_loss'] = total_loss / float(total_norm)
        verbose_output(verbose, cm)

        return metrics
Example #13
    return loss_value


for epoch in range(num_epochs):
    loss_acc = 0.
    step = 0
    start = time.time()
    for x, y in train_set.get_input(training=True):
        loss_value = train_step(optimizer, model, x, y)
        loss_acc += loss_value
        step += 1

    print('training time {}'.format(time.time() - start))
    mean_loss = loss_acc / step
    print('Training Loss {}'.format(mean_loss))
    cm = ConfusionMatrix(['0', '1'])
    for x, y in valid_set.get_input():
        y_ = np.argmax(to_device(model(x)), axis=1)
        cm.add_batch(y, y_)
    print(cm)
    print(cm.get_all_metrics())

print('FINAL')
cm = ConfusionMatrix(['0', '1'])
for x, y in test_set.get_input():
    y_ = tf.argmax(to_device(model(x)), axis=1, output_type=tf.int32)
    cm.add_batch(y, y_)

print(cm)
print(cm.get_all_metrics())
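
Example #13 opens with the tail of a `train_step` whose definition has been cut off.  A minimal sketch of what such a step could look like, assuming a standard TF eager `GradientTape` loop with a sparse cross-entropy loss (both the loss choice and the model signature are assumptions, not the original code):

import tensorflow as tf

def train_step(optimizer, model, x, y):
    # One optimization step: forward pass, loss, gradients, parameter update.
    with tf.GradientTape() as tape:
        logits = model(x)
        loss_value = tf.reduce_mean(
            tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits))
    grads = tape.gradient(loss_value, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss_value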
Example #14
def make_mc_cm():
    cm = ConfusionMatrix(LABELS)
    for y_t, y_p in zip(Y_TRUE, Y_PRED):
        cm.add(y_t, y_p)
    return cm
Example #15
def test_create_cm():
    gold = make_mc_cm()
    cm = ConfusionMatrix.create(Y_TRUE, Y_PRED)
    np.testing.assert_equal(gold._cm, cm._cm)
Example #16
def test_mcc_example():
    Y_TRUE = [1, 1, 1, 0]
    Y_PRED = [1, 0, 1, 1]
    MCC_GOLD = -0.3333333333333333
    cm = ConfusionMatrix.create(Y_TRUE, Y_PRED)
    np.testing.assert_allclose(cm.get_mcc(), MCC_GOLD, TOL)
Example #17
class JointTaggerEvaluatorEagerTf:
    """Performs evaluation on joint tagger and classifier output
    """
    def __init__(self, model, span_type, verbose):
        """Construct from an existing model

        :param model: A model
        :param span_type: (`str`) The span type
        :param verbose: (`bool`) Be verbose?
        """
        self.model = model

        self.idx2label = revlut(model.labels["tags"])
        self.idx2classlabel = revlut(model.labels["class_labels"])
        self.cm = None

        self.span_type = span_type
        if verbose:
            print('Setting span type {}'.format(self.span_type))
        self.verbose = verbose

    def process_batch(self, batch, truth, handle=None, txts=None, ids=None):
        class_guess_logits, guess = self.model(batch)
        sentence_lengths = batch['lengths']
        true_class_labels = batch['class_label']

        correct_labels = 0
        total_labels = 0

        # For fscore
        gold_chunks = []
        pred_chunks = []

        actual_class_labels = []
        predicted_class_labels = []
        # For each sentence
        for b in range(len(guess)):
            length = sentence_lengths[b]
            sentence = guess[b][:length].numpy()
            # truth[b] is padded; slicing to :length recovers the true-length gold sequence
            gold = truth[b][:length].numpy()
            actual_class_labels.append(true_class_labels[b].numpy())
            predicted_class_labels.append(
                tf.argmax(class_guess_logits[b], axis=0, output_type=tf.int32))

            valid_guess = sentence[gold != Offsets.PAD]
            valid_gold = gold[gold != Offsets.PAD]
            valid_sentence_length = np.sum(gold != Offsets.PAD)

            correct_labels += np.sum(np.equal(valid_guess, valid_gold))
            total_labels += valid_sentence_length

            gold_chunks.append(
                set(
                    to_spans(valid_gold, self.idx2label, self.span_type,
                             self.verbose)))
            pred_chunks.append(
                set(
                    to_spans(valid_guess, self.idx2label, self.span_type,
                             self.verbose)))

            if not (handle is None or txts is None):
                example_id = ids[b]
                example_txt = txts[example_id]
                write_sentence_conll(handle, valid_guess, valid_gold,
                                     example_txt, self.idx2label)

        self.cm.add_batch(actual_class_labels, predicted_class_labels)
        return correct_labels, total_labels, gold_chunks, pred_chunks

    def test(self, ts, steps=0, **kwargs):
        """Method that evaluates on some data.  There are 2 modes this can run in, `feed_dict` and `dataset`

        In `feed_dict` mode, the model cycles the test data batch-wise and feeds each batch in with a `feed_dict`.
        In `dataset` mode, the data is still passed in to this method, but it is not passed in a `feed_dict` and is
        mostly superfluous since the features are grafted right onto the graph.  However, we do use it for supplying
        the ground truth, ids and text, so it is essential that the caller does not shuffle the data
        :param ts: The test set
        :param conll_output: (`str`) An optional file output
        :param txts: A list of text data associated with the encoded batch
        :param dataset: (`bool`) Is this using `tf.dataset`s
        :return: The metrics
        """
        SET_TRAIN_FLAG(False)

        total_correct = total_sum = 0
        gold_spans = []
        pred_spans = []

        self.cm = ConfusionMatrix(self.idx2classlabel)

        handle = None
        if kwargs.get("conll_output") is not None and kwargs.get(
                'txts') is not None:
            handle = open(kwargs.get("conll_output"), "w")

        try:
            pg = create_progress_bar(steps)
            metrics = {}
            for (features, y), batch in pg(
                    zip_longest(ts, kwargs.get('batches', []), fillvalue={})):
                correct, count, golds, guesses = self.process_batch(
                    features,
                    y,
                    handle=handle,
                    txts=kwargs.get("txts"),
                    ids=batch.get("ids"))
                total_correct += correct
                total_sum += count
                gold_spans.extend(golds)
                pred_spans.extend(guesses)

            total_acc = total_correct / float(total_sum)
            # Span-level F1 over the collected gold and predicted chunks
            metrics['tagging_f1'] = span_f1(gold_spans, pred_spans)
            metrics['tagging_acc'] = total_acc
            metrics.update({
                f"classification_{k}": v
                for k, v in self.cm.get_all_metrics().items()
            })
            if self.verbose:
                conll_metrics = per_entity_f1(gold_spans, pred_spans)
                conll_metrics['acc'] = total_acc * 100
                conll_metrics['tokens'] = total_sum
                logger.info(conlleval_output(conll_metrics))
        finally:
            if handle is not None:
                handle.close()

        return metrics
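
For completeness, a hedged usage sketch of the evaluator defined in Example #17; the model, dataset, step count, span type and file names below are assumptions for illustration only:

evaluator = JointTaggerEvaluatorEagerTf(model, span_type='iobes', verbose=True)
metrics = evaluator.test(
    test_dataset,
    steps=num_test_steps,
    conll_output='joint-test.conll',  # optional CoNLL dump, written only when txts is also given
    txts=test_texts,
    batches=test_batches)             # supplies the ids used to look up each example's text
print(metrics['tagging_f1'], metrics['tagging_acc'])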