Example #1
0
    def get_features(all_sents,
                     tokenizer,
                     max_seq_length,
                     labels=None,
                     verbose=True):
        """Encode sentences into (input_ids, segment_ids, input_mask, label) features.

        Args:
            all_sents: list of sentences; each sentence is an iterable of words.
            tokenizer: tokenizer exposing `cls_token`, `sep_token`,
                `text_to_tokens(word)` and `tokens_to_ids(token)`.
            max_seq_length: maximum number of subtokens per sentence,
                including the [CLS] and [SEP] tokens; a non-positive value
                disables truncation.
            labels: optional per-sentence labels; -1 is stored when absent.
            verbose: when True, log the first two examples and length stats.

        Returns:
            list of [input_ids, segment_ids, input_mask, label] entries with
            the first three elements as numpy arrays.
        """
        features = []
        sent_lengths = []
        too_long_count = 0
        for sent_id, sent in enumerate(all_sents):
            if sent_id % 1000 == 0:
                logging.debug(f"Encoding sentence {sent_id}/{len(all_sents)}")
            sent_subtokens = [tokenizer.cls_token]
            for word in sent:
                sent_subtokens.extend(tokenizer.text_to_tokens(word))

            # The +1 accounts for the [SEP] token appended below. Truncate to
            # max_seq_length - 1 so the final length (with [SEP]) is exactly
            # max_seq_length; the previous slice kept max_seq_length tokens,
            # which produced sequences one token over the limit.
            if max_seq_length > 0 and len(sent_subtokens) + 1 > max_seq_length:
                sent_subtokens = sent_subtokens[:max_seq_length - 1]
                too_long_count += 1

            sent_subtokens.append(tokenizer.sep_token)
            sent_lengths.append(len(sent_subtokens))

            input_ids = [tokenizer.tokens_to_ids(t) for t in sent_subtokens]

            # The mask has 1 for real tokens and 0 for padding tokens.
            # Only real tokens are attended to.
            input_mask = [1] * len(input_ids)
            segment_ids = [0] * len(input_ids)

            if verbose and sent_id < 2:
                logging.info("*** Example ***")
                logging.info(f"example {sent_id}: {sent}")
                logging.info("subtokens: %s" % " ".join(sent_subtokens))
                logging.info("input_ids: %s" % list2str(input_ids))
                logging.info("segment_ids: %s" % list2str(segment_ids))
                logging.info("input_mask: %s" % list2str(input_mask))
                # Parentheses are required: without them the conditional
                # expression binds below '%', so when labels was None the
                # entire message collapsed to just "**Not Provided**".
                logging.info("label: %s" %
                             (labels[sent_id] if labels else "**Not Provided**"))

            label = labels[sent_id] if labels else -1
            features.append([
                np.asarray(input_ids),
                np.asarray(segment_ids),
                np.asarray(input_mask), label
            ])

        # Use the same guard as the truncation branch above (> 0, not > -1);
        # too_long_count can only be non-zero when max_seq_length > 0 anyway.
        if max_seq_length > 0 and too_long_count > 0:
            logging.warning(
                f'Found {too_long_count} out of {len(all_sents)} sentences with more than {max_seq_length} subtokens. '
                f'Truncated long sentences from the end.')
        if verbose:
            get_stats(sent_lengths)
        return features
Example #2
0
    def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):
        """Aggregate per-step validation outputs at the end of validation.

        Args:
            outputs: list of per-step dicts, each with a 'val_loss' tensor and
                an 'eval_tensors' dict holding 'preds' and 'labels' tensors.
            dataloader_idx: index of the validation dataloader being reported.

        Returns:
            dict with the averaged 'val_loss' and a 'log' dict of metrics.
        """
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        preds = torch.cat([x['eval_tensors']['preds'] for x in outputs])
        labels = torch.cat([x['eval_tensors']['labels'] for x in outputs])

        # Gather predictions/labels from every worker so rank 0 can compute
        # metrics over the full validation set; single-process runs just wrap
        # the local tensors.
        all_preds = []
        all_labels = []
        if torch.distributed.is_initialized():
            world_size = torch.distributed.get_world_size()
            for _ in range(world_size):
                all_preds.append(torch.empty_like(preds))
                all_labels.append(torch.empty_like(labels))
            torch.distributed.all_gather(all_preds, preds)
            torch.distributed.all_gather(all_labels, labels)
        else:
            all_preds.append(preds)
            all_labels.append(labels)

        tensorboard_logs = {}
        # Only rank 0 (or the single process) computes and writes metrics.
        if not torch.distributed.is_initialized(
        ) or torch.distributed.get_rank() == 0:
            preds = []
            labels = []
            for pred_tensor in all_preds:
                preds.extend(tensor2list(pred_tensor))
            for label_tensor in all_labels:
                labels.extend(tensor2list(label_tensor))

            tensorboard_logs = compute_metrics(self.task_name, np.array(preds),
                                               np.array(labels))
            val_name = self._validation_names[dataloader_idx].upper()
            logging.info(f'{val_name} evaluation: {tensorboard_logs}')

            # Write labels and predictions to a file if output_dir is
            # specified in the config.
            output_dir = self._cfg.output_dir
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
                filename = os.path.join(output_dir,
                                        f'{self.task_name}_{val_name}.txt')
                # Bug fix: the f-string previously contained no placeholder
                # and logged a literal "(unknown)" instead of the target path.
                logging.info(f'Saving labels and predictions to {filename}')
                with open(filename, 'w') as f:
                    f.write('labels\t' + list2str(labels) + '\n')
                    f.write('preds\t' + list2str(preds) + '\n')

        tensorboard_logs['val_loss'] = avg_loss
        for key in tensorboard_logs:
            self.log(f'{key}', tensorboard_logs[key], prog_bar=True)

        return {'val_loss': avg_loss, 'log': tensorboard_logs}