def evaluate_predictions(self, logits, label_ids):
    """
    Score model predictions against the truth labels and log the F1 value.

    Positions whose label id equals 0 are dropped before scoring. For the
    remaining positions the predicted class is the arg-max over the class
    dimension of the log-softmax of the logits.

    Args:
        logits: model logits; viewed as ``(-1, len(self.labels_id_map) + 1)``
        label_ids: truth label ids; viewed as a flat vector
    """
    flat_labels = label_ids.view(-1)
    keep = flat_labels != 0.0
    num_classes = len(self.labels_id_map) + 1

    kept_labels = flat_labels[keep]
    kept_logits = logits.view(-1, num_classes)[keep]

    log_probs = F.log_softmax(kept_logits, dim=1)
    predicted = torch.argmax(log_probs, dim=1)

    # Hand plain numpy arrays to the label extraction helper.
    predicted_np = predicted.detach().cpu().numpy()
    truth_np = kept_labels.detach().cpu().numpy()

    # Only the third element of the helper's result is used here.
    _, _, f1 = self.extract_labels(truth_np, self.labels_id_map, predicted_np)
    logger.info("Evaluation on set = F1: {}".format(f1))
def _convert_examples_to_features(self,
                                  examples: List[TokenClsInputExample],
                                  max_seq_length,
                                  tokenizer,
                                  include_labels=True,
                                  cls_token_at_end=False,
                                  pad_on_left=False,
                                  cls_token='[CLS]',
                                  sep_token='[SEP]',
                                  pad_token=0,
                                  sequence_segment_id=0,
                                  cls_token_segment_id=1,
                                  pad_token_segment_id=0,
                                  mask_padding_with_zero=True):
    """
    Convert token-classification examples into fixed-length `InputFeatures`.

    Each word of an example is split by ``tokenizer.tokenize``. Only the
    first sub-token of a word is flagged in ``valid_ids`` (1) and carries the
    word's label id; the remaining sub-tokens, the special tokens and the
    padding get 0 in ``valid_ids`` and the pad label (0).

    `cls_token_at_end` define the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + [CLS]
    `cls_token_segment_id` define the segment id associated to the CLS token
    (0 for BERT, 2 for XLNet); note that the default used here is 1.

    Args:
        examples: examples exposing ``tokens`` and, when labels are
            included, a parallel ``label`` sequence
        max_seq_length: exact length of every produced sequence, including
            the two special tokens
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``
        include_labels: when False, no labels are read and ``label_id`` of
            each feature is an empty list
        cls_token_at_end: append the CLS token instead of prepending it
        pad_on_left: pad at the beginning instead of the end
        cls_token: CLS token string
        sep_token: SEP token string
        pad_token: id used to pad ``input_ids``
        sequence_segment_id: segment id of the sequence tokens and the SEP
        cls_token_segment_id: segment id of the CLS token
        pad_token_segment_id: segment id used for padding positions
        mask_padding_with_zero: when True the mask is 1 for real tokens and
            0 for padding; when False the values are swapped

    Returns:
        list of `InputFeatures`, one per example
    """
    if include_labels:
        # Inverted so that an example's label value is the lookup key.
        label_map = {v: k for k, v in self.labels_id_map.items()}
    label_pad = 0
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1
    # Room left for word pieces once [CLS] and [SEP] are accounted for.
    max_content_length = max_seq_length - 2

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Processing example %d of %d", ex_index, len(examples))

        tokens = []
        labels = []
        valid_tokens = []
        for i, token in enumerate(example.tokens):
            new_tokens = tokenizer.tokenize(token)
            if not new_tokens:
                # Some tokenizers return nothing for a word (e.g. one made
                # only of characters they strip). Indexing position 0 would
                # raise IndexError, so the word is skipped; it contributes
                # no sub-token and no valid position.
                continue
            trailing = len(new_tokens) - 1
            tokens.extend(new_tokens)
            valid_tokens.extend([1] + [0] * trailing)
            if include_labels:
                # .get() yields None for a label missing from the map.
                labels.extend([label_map.get(example.label[i])] + [label_pad] * trailing)

        # truncate by max_seq_length
        tokens = tokens[:max_content_length]
        valid_tokens = valid_tokens[:max_content_length]
        if include_labels:
            labels = labels[:max_content_length]

        # The convention in BERT for single sequences is:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0     0   0   0  0     0 0
        #
        # "type_ids" indicate whether a token belongs to the first or the
        # second sequence. The embedding vectors for `type=0` and `type=1`
        # were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly*
        # necessary since the [SEP] token unambiguously separates the
        # sequences, but it makes it easier for the model to learn the
        # concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to
        # [CLS]) is used as the "sentence vector". Note that this only makes
        # sense because the entire model is fine-tuned.
        tokens = tokens + [sep_token]
        valid_tokens = valid_tokens + [0]
        if include_labels:
            labels = labels + [label_pad]
        segment_ids = [sequence_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
            valid_tokens = valid_tokens + [0]
            if include_labels:
                labels = labels + [label_pad]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids
            valid_tokens = [0] + valid_tokens
            if include_labels:
                labels = [label_pad] + labels

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask marks real tokens; only real tokens are attended to.
        input_mask = [real_mask_value] * len(input_ids)

        # Pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        ids_padding = [pad_token] * padding_length
        mask_padding = [pad_mask_value] * padding_length
        segment_padding = [pad_token_segment_id] * padding_length
        zero_padding = [0] * padding_length
        label_padding = [label_pad] * padding_length
        if pad_on_left:
            input_ids = ids_padding + input_ids
            input_mask = mask_padding + input_mask
            segment_ids = segment_padding + segment_ids
            valid_tokens = zero_padding + valid_tokens
            if include_labels:
                labels = label_padding + labels
        else:
            input_ids = input_ids + ids_padding
            input_mask = input_mask + mask_padding
            segment_ids = segment_ids + segment_padding
            valid_tokens = valid_tokens + zero_padding
            if include_labels:
                labels = labels + label_padding

        # Internal sanity checks: every sequence has the requested length.
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(valid_tokens) == max_seq_length
        if include_labels:
            assert len(labels) == max_seq_length

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=labels,
                          valid_ids=valid_tokens))
    return features
def _convert_examples_to_features(self,
                                  examples: List[TokenClsInputExample],
                                  max_seq_length,
                                  tokenizer,
                                  include_labels=True,
                                  cls_token_at_end=False,
                                  pad_on_left=False,
                                  cls_token='[CLS]',
                                  sep_token='[SEP]',
                                  pad_token=0,
                                  sequence_segment_id=0,
                                  sep_token_extra=0,
                                  cls_token_segment_id=1,
                                  pad_token_segment_id=0,
                                  mask_padding_with_zero=True):
    """
    Convert token-classification examples into fixed-length `InputFeatures`.

    Each word of an example is split by ``tokenizer.tokenize``. Only the
    first sub-token of a word is flagged in ``valid_ids`` (1) and carries the
    word's label id; the remaining sub-tokens, the special tokens and the
    padding get 0 in ``valid_ids`` and the pad label (0).

    `cls_token_at_end` define the location of the CLS token:
        - False (Default, BERT/XLM pattern): [CLS] + A + [SEP]
        - True (XLNet/GPT pattern): A + [SEP] + [CLS]
    `cls_token_segment_id` define the segment id associated to the CLS token
    (0 for BERT, 2 for XLNet); note that the default used here is 1.

    Args:
        examples: examples exposing ``tokens`` and, when labels are
            included, a parallel ``label`` sequence
        max_seq_length: exact length of every produced sequence, including
            the special tokens
        tokenizer: object providing ``tokenize`` and ``convert_tokens_to_ids``
        include_labels: when False, no labels are read and ``label_id`` of
            each feature is an empty list
        cls_token_at_end: append the CLS token instead of prepending it
        pad_on_left: pad at the beginning instead of the end
        cls_token: CLS token string
        sep_token: SEP token string
        pad_token: id used to pad ``input_ids``
        sequence_segment_id: segment id of the sequence tokens and the SEP(s)
        sep_token_extra: when truthy, a second SEP token follows the first
            (roberta special case) and three positions are reserved for
            special tokens instead of two
        cls_token_segment_id: segment id of the CLS token
        pad_token_segment_id: segment id used for padding positions
        mask_padding_with_zero: when True the mask is 1 for real tokens and
            0 for padding; when False the values are swapped

    Returns:
        list of `InputFeatures`, one per example
    """
    if include_labels:
        # Inverted so that an example's label value is the lookup key.
        label_map = {v: k for k, v in self.labels_id_map.items()}
    label_pad = 0
    real_mask_value = 1 if mask_padding_with_zero else 0
    pad_mask_value = 0 if mask_padding_with_zero else 1
    # Positions reserved for CLS plus one or two SEP tokens.
    special_tokens_count = 3 if sep_token_extra else 2
    max_content_length = max_seq_length - special_tokens_count

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Processing example %d of %d", ex_index, len(examples))

        tokens = []
        labels = []
        valid_tokens = []
        for i, token in enumerate(example.tokens):
            new_tokens = tokenizer.tokenize(token)
            if not new_tokens:
                # Some tokenizers return nothing for a word (e.g. one made
                # only of characters they strip). Indexing position 0 would
                # raise IndexError, so the word is skipped; it contributes
                # no sub-token and no valid position.
                continue
            trailing = len(new_tokens) - 1
            tokens.extend(new_tokens)
            valid_tokens.extend([1] + [0] * trailing)
            if include_labels:
                # .get() yields None for a label missing from the map.
                labels.extend([label_map.get(example.label[i])] + [label_pad] * trailing)

        # truncate by max_seq_length
        tokens = tokens[:max_content_length]
        valid_tokens = valid_tokens[:max_content_length]
        if include_labels:
            labels = labels[:max_content_length]

        tokens += [sep_token]
        valid_tokens += [0]
        if include_labels:
            labels += [label_pad]
        if sep_token_extra:
            # roberta special case
            tokens += [sep_token]
            valid_tokens += [0]
            if include_labels:
                labels += [label_pad]
        segment_ids = [sequence_segment_id] * len(tokens)

        if cls_token_at_end:
            tokens = tokens + [cls_token]
            segment_ids = segment_ids + [cls_token_segment_id]
            valid_tokens = valid_tokens + [0]
            if include_labels:
                labels = labels + [label_pad]
        else:
            tokens = [cls_token] + tokens
            segment_ids = [cls_token_segment_id] + segment_ids
            valid_tokens = [0] + valid_tokens
            if include_labels:
                labels = [label_pad] + labels

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        # The mask marks real tokens; padding gets the opposite value.
        input_mask = [real_mask_value] * len(input_ids)

        # Pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        ids_padding = [pad_token] * padding_length
        mask_padding = [pad_mask_value] * padding_length
        segment_padding = [pad_token_segment_id] * padding_length
        zero_padding = [0] * padding_length
        label_padding = [label_pad] * padding_length
        if pad_on_left:
            input_ids = ids_padding + input_ids
            input_mask = mask_padding + input_mask
            segment_ids = segment_padding + segment_ids
            valid_tokens = zero_padding + valid_tokens
            if include_labels:
                labels = label_padding + labels
        else:
            input_ids = input_ids + ids_padding
            input_mask = input_mask + mask_padding
            segment_ids = segment_ids + segment_padding
            valid_tokens = valid_tokens + zero_padding
            if include_labels:
                labels = labels + label_padding

        # Internal sanity checks: every sequence has the requested length.
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        assert len(valid_tokens) == max_seq_length
        if include_labels:
            assert len(labels) == max_seq_length

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids,
                          label_id=labels,
                          valid_ids=valid_tokens))
    return features