Example no. 1
    def evaluate_predictions(self, logits, label_ids):
        """
        Run evaluation on the given logits and ground-truth labels

        Args:
            logits: model output logits
            label_ids: ground-truth label ids
        """
        # keep only positions whose label id is not the padding id (0)
        active_positions = label_ids.view(-1) != 0.0
        active_labels = label_ids.view(-1)[active_positions]
        active_logits = logits.view(-1, len(self.labels_id_map) + 1)[active_positions]
        # predicted label ids: argmax over the label dimension
        predictions = torch.argmax(F.log_softmax(active_logits, dim=1), dim=1)
        predictions = predictions.detach().cpu().numpy()
        out_label_ids = active_labels.detach().cpu().numpy()
        _, _, f1 = self.extract_labels(out_label_ids, self.labels_id_map, predictions)
        logger.info("Evaluation on set = F1: {}".format(f1))
Example no. 2
    def _convert_examples_to_features(self,
                                      examples: List[TokenClsInputExample],
                                      max_seq_length,
                                      tokenizer,
                                      include_labels=True,
                                      cls_token_at_end=False,
                                      pad_on_left=False,
                                      cls_token='[CLS]',
                                      sep_token='[SEP]',
                                      pad_token=0,
                                      sequence_segment_id=0,
                                      cls_token_segment_id=1,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
        """ Loads a data file into a list of `InputBatch`s
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token
            (0 for BERT, 2 for XLNet)
        """

        if include_labels:
            label_map = {v: k for k, v in self.labels_id_map.items()}
            label_pad = 0

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Processing example %d of %d" %
                            (ex_index, len(examples)))

            tokens = []
            labels = []
            valid_tokens = []
            for i, token in enumerate(example.tokens):
                new_tokens = tokenizer.tokenize(token)
                tokens.extend(new_tokens)
                # mark only the first word-piece of each original token as valid
                v_tok = [0] * (len(new_tokens))
                v_tok[0] = 1
                valid_tokens.extend(v_tok)
                if include_labels:
                    # the label is attached to the first word-piece; sub-word pieces get the pad label
                    v_lbl = [label_pad] * (len(new_tokens))
                    v_lbl[0] = label_map.get(example.label[i])
                    labels.extend(v_lbl)

            # truncate to max_seq_length, leaving room for the [CLS] and [SEP] tokens
            tokens = tokens[:(max_seq_length - 2)]
            if include_labels:
                labels = labels[:(max_seq_length - 2)]
            valid_tokens = valid_tokens[:(max_seq_length - 2)]

            # The convention in BERT is:
            # (a) For sequence pairs:
            #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
            #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
            # (b) For single sequences:
            #  tokens:   [CLS] the dog is hairy . [SEP]
            #  type_ids:   0   0   0   0  0     0   0
            #
            # Where "type_ids" are used to indicate whether this is the first
            # sequence or the second sequence. The embedding vectors for `type=0` and
            # `type=1` were learned during pre-training and are added to the wordpiece
            # embedding vector (and position vector). This is not *strictly* necessary
            # since the [SEP] token unambiguously separates the sequences, but it makes
            # it easier for the model to learn the concept of sequences.
            #
            # For classification tasks, the first vector (corresponding to [CLS]) is
            # used as the "sentence vector". Note that this only makes sense because
            # the entire model is fine-tuned.
            tokens = tokens + [sep_token]
            if include_labels:
                labels = labels + [label_pad]
            valid_tokens = valid_tokens + [0]
            segment_ids = [sequence_segment_id] * len(tokens)

            if cls_token_at_end:
                tokens = tokens + [cls_token]
                segment_ids = segment_ids + [cls_token_segment_id]
                if include_labels:
                    labels = labels + [label_pad]
                valid_tokens = valid_tokens + [0]
            else:
                tokens = [cls_token] + tokens
                segment_ids = [cls_token_segment_id] + segment_ids
                if include_labels:
                    labels = [label_pad] + labels
                valid_tokens = [0] + valid_tokens

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                input_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + input_mask
                segment_ids = ([pad_token_segment_id] *
                               padding_length) + segment_ids
                if include_labels:
                    labels = ([label_pad] * padding_length) + labels
                valid_tokens = ([0] * padding_length) + valid_tokens
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                input_mask = input_mask + (
                    [0 if mask_padding_with_zero else 1] * padding_length)
                segment_ids = segment_ids + ([pad_token_segment_id] *
                                             padding_length)
                if include_labels:
                    labels = labels + ([label_pad] * padding_length)
                valid_tokens = valid_tokens + ([0] * padding_length)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(valid_tokens) == max_seq_length
            if include_labels:
                assert len(labels) == max_seq_length

            features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=labels,
                              valid_ids=valid_tokens))
        return features
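
To make the word-piece alignment in the tokenization loop above concrete, here is a self-contained toy that replays it with a stubbed tokenizer (the sub-word splits and the label map are invented for illustration and are not part of the library):

    # Stub tokenizer: splits "jacksonville" the way a WordPiece tokenizer might.
    def toy_tokenize(token):
        return {"jacksonville": ["jack", "##son", "##ville"]}.get(token, [token])

    example_tokens = ["is", "jacksonville", "nice"]
    example_labels = ["O", "B-LOC", "O"]
    label_map = {"O": 1, "B-LOC": 2}
    label_pad = 0

    tokens, labels, valid_tokens = [], [], []
    for i, token in enumerate(example_tokens):
        new_tokens = toy_tokenize(token)
        tokens.extend(new_tokens)
        v_tok = [0] * len(new_tokens)
        v_tok[0] = 1                             # only the first word-piece is "valid"
        valid_tokens.extend(v_tok)
        v_lbl = [label_pad] * len(new_tokens)
        v_lbl[0] = label_map[example_labels[i]]  # label rides on the first word-piece
        labels.extend(v_lbl)

    print(tokens)        # ['is', 'jack', '##son', '##ville', 'nice']
    print(valid_tokens)  # [1, 1, 0, 0, 1]
    print(labels)        # [1, 2, 0, 0, 1]

The pattern is the one the method relies on downstream: each original token contributes exactly one position marked 1 in valid_tokens, and its label travels with that position.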
Example no. 3
    def _convert_examples_to_features(self,
                                      examples: List[TokenClsInputExample],
                                      max_seq_length,
                                      tokenizer,
                                      include_labels=True,
                                      cls_token_at_end=False,
                                      pad_on_left=False,
                                      cls_token='[CLS]',
                                      sep_token='[SEP]',
                                      pad_token=0,
                                      sequence_segment_id=0,
                                      sep_token_extra=0,
                                      cls_token_segment_id=1,
                                      pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
        """ Loads a data file into a list of `InputBatch`s
            `cls_token_at_end` define the location of the CLS token:
                - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
                - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
            `cls_token_segment_id` define the segment id associated to the CLS token
            (0 for BERT, 2 for XLNet)
        """

        if include_labels:
            label_map = {v: k for k, v in self.labels_id_map.items()}
            label_pad = 0

        features = []
        for (ex_index, example) in enumerate(examples):
            if ex_index % 10000 == 0:
                logger.info("Processing example %d of %d" %
                            (ex_index, len(examples)))

            tokens = []
            labels = []
            valid_tokens = []
            for i, token in enumerate(example.tokens):
                new_tokens = tokenizer.tokenize(token)
                tokens.extend(new_tokens)
                v_tok = [0] * (len(new_tokens))
                v_tok[0] = 1
                valid_tokens.extend(v_tok)
                if include_labels:
                    v_lbl = [label_pad] * (len(new_tokens))
                    v_lbl[0] = label_map.get(example.label[i])
                    labels.extend(v_lbl)

            # truncate to max_seq_length, leaving room for the special tokens
            special_tokens_count = 3 if sep_token_extra else 2
            tokens = tokens[:(max_seq_length - special_tokens_count)]
            valid_tokens = valid_tokens[:(max_seq_length -
                                          special_tokens_count)]
            if include_labels:
                labels = labels[:(max_seq_length - special_tokens_count)]

            tokens += [sep_token]
            if include_labels:
                labels += [label_pad]
            valid_tokens += [0]
            if sep_token_extra:  # RoBERTa special case: an extra separator token
                tokens += [sep_token]
                valid_tokens += [0]
                if include_labels:
                    labels += [label_pad]
            segment_ids = [sequence_segment_id] * len(tokens)

            if cls_token_at_end:
                tokens = tokens + [cls_token]
                segment_ids = segment_ids + [cls_token_segment_id]
                if include_labels:
                    labels = labels + [label_pad]
                valid_tokens = valid_tokens + [0]
            else:
                tokens = [cls_token] + tokens
                segment_ids = [cls_token_segment_id] + segment_ids
                if include_labels:
                    labels = [label_pad] + labels
                valid_tokens = [0] + valid_tokens

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            padding_length = max_seq_length - len(input_ids)
            if pad_on_left:
                input_ids = ([pad_token] * padding_length) + input_ids
                input_mask = ([0 if mask_padding_with_zero else 1] *
                              padding_length) + input_mask
                segment_ids = ([pad_token_segment_id] *
                               padding_length) + segment_ids
                if include_labels:
                    labels = ([label_pad] * padding_length) + labels
                valid_tokens = ([0] * padding_length) + valid_tokens
            else:
                input_ids = input_ids + ([pad_token] * padding_length)
                input_mask = input_mask + (
                    [0 if mask_padding_with_zero else 1] * padding_length)
                segment_ids = segment_ids + ([pad_token_segment_id] *
                                             padding_length)
                if include_labels:
                    labels = labels + ([label_pad] * padding_length)
                valid_tokens = valid_tokens + ([0] * padding_length)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length
            assert len(valid_tokens) == max_seq_length
            if include_labels:
                assert len(labels) == max_seq_length

            features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=labels,
                              valid_ids=valid_tokens))
        return features
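
For reference, a small sketch of how sep_token_extra and cls_token_at_end change the special-token layout built above (the token strings are illustrative placeholders; real values come from the tokenizer):

    def build_layout(body_tokens, sep_token, cls_token,
                     sep_token_extra=False, cls_token_at_end=False):
        # Mirrors the special-token placement in _convert_examples_to_features.
        tokens = list(body_tokens) + [sep_token]
        if sep_token_extra:          # RoBERTa-style: two consecutive separators
            tokens += [sep_token]
        if cls_token_at_end:         # XLNet/GPT-style: classification token last
            tokens += [cls_token]
        else:                        # BERT-style: classification token first
            tokens = [cls_token] + tokens
        return tokens

    print(build_layout(["hello", "world"], "[SEP]", "[CLS]"))
    # ['[CLS]', 'hello', 'world', '[SEP]']
    print(build_layout(["hello", "world"], "</s>", "<s>", sep_token_extra=True))
    # ['<s>', 'hello', 'world', '</s>', '</s>']

With sep_token_extra set, two separators are appended back to back, mirroring the RoBERTa branch in the method; labels and valid_tokens receive matching padding entries so all five per-feature lists stay the same length.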