def __call__(self, texts_a: List[str]):
    input_features = []
    tokens = []
    mask_idxs = []
    for text_a in texts_a:
        encoded_dict = self.tokenizer.encode_plus(
            text=text_a,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt')
        curr_features = InputFeatures(
            input_ids=encoded_dict['input_ids'],
            attention_mask=encoded_dict['attention_mask'],
            token_type_ids=encoded_dict['token_type_ids'],
            label=None)
        input_features.append(curr_features)
        # Use a local variable for the mask search; the original assigned to
        # `tokens` here, which clobbered the accumulated list on every iteration.
        curr_tokens = self.tokenizer.convert_ids_to_tokens(
            encoded_dict['input_ids'][0])
        if self.return_tokens:
            tokens.append(curr_tokens)
        mask_idx = 0
        for i, token in enumerate(curr_tokens):
            if token == '[MASK]':
                mask_idx = i
        mask_idxs.append(mask_idx)
    if self.return_tokens:
        return input_features, tokens, mask_idxs
    else:
        return input_features, mask_idxs

def convert_one_example_to_features(examples, tokenizer, max_length=512, pad_token=0,
                                    pad_token_segment_id=0, mask_padding_with_zero=True):
    features = []
    for (ex_index, example) in enumerate(examples):
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
            truncate_first_sequence=False  # keep the first sequence intact; truncate the second in priority
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=None))
    return features

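# The manual right-padding pattern above recurs throughout this file. Below is
# a minimal, self-contained sketch of the same convention (illustrative token
# ids only; in real code the pad id comes from the tokenizer):
def pad_to_max(input_ids, max_length, pad_token=0, pad_token_segment_id=0):
    """Right-pad ids and build an attention mask (1 = real token, 0 = padding)."""
    attention_mask = [1] * len(input_ids)
    token_type_ids = [0] * len(input_ids)
    padding_length = max_length - len(input_ids)
    input_ids = input_ids + [pad_token] * padding_length
    attention_mask = attention_mask + [0] * padding_length
    token_type_ids = token_type_ids + [pad_token_segment_id] * padding_length
    return input_ids, attention_mask, token_type_ids

# [CLS]=101, [SEP]=102 in the standard BERT vocab; padded from 3 to 5 positions.
assert pad_to_max([101, 2023, 102], 5) == (
    [101, 2023, 102, 0, 0], [1, 1, 1, 0, 0], [0, 0, 0, 0, 0])
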
def convert_examples_to_features(examples, tokenizer, max_length=512, label2id=None):
    logger.info("Creating features")
    features = []
    for (ex_index, example) in tqdm(enumerate(examples)):
        inputs = tokenizer.encode_plus(example.text_a,
                                       add_special_tokens=True,
                                       max_length=max_length,
                                       pad_to_max_length=True,
                                       truncation="longest_first")
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = inputs['attention_mask']
        input_len, att_mask_len, token_type_len = (
            len(input_ids), len(attention_mask), len(token_type_ids))

        assert input_len == max_length, "input_ids length mismatch {} vs {}".format(input_len, max_length)
        assert att_mask_len == max_length, "attention_mask length mismatch {} vs {}".format(att_mask_len, max_length)
        assert token_type_len == max_length, "token_type_ids length mismatch {} vs {}".format(token_type_len, max_length)

        label = label2id[example.label]
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))
    return features

def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    # batch_encoding = tokenizer(
    #     [(example.text_a, example.text_b) for example in examples],
    #     max_length=max_length,
    #     padding="max_length",
    #     truncation=True,
    # )

    features = []
    for i in range(len(examples)):
        # inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        inputs = tokenizer.encode_plus(
            text=examples[i].text_a.split(" "),
            text_pair=examples[i].text_b.split(" ") if examples[i].text_b else None,
            max_length=max_length,
            padding="max_length",
            truncation=True)
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features

def create_input_feature(tokenizer, output_mode, example, max_length,
                         mask_padding_with_zero, pad_on_left, pad_token,
                         pad_token_segment_id, label_map):
    example = InputExample(
        example['id'], example['sentence1'],
        example['sentence2'] if 'sentence2' in example else None,
        example['label'])
    inputs = tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=max_length,
        truncation_strategy='only_first'  # We're truncating the first sequence in priority
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_length - len(input_ids)
    if pad_on_left:
        input_ids = ([pad_token] * padding_length) + input_ids
        attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
        token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
    else:
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

    assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
    assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
    assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

    if output_mode == "classification":
        label = label_map[example.label]
    elif output_mode == "regression":
        label = float(example.label)
    else:
        raise KeyError(output_mode)

    return InputFeatures(input_ids=input_ids,
                         attention_mask=attention_mask,
                         token_type_ids=token_type_ids,
                         label=label)

def _convert_examples_to_features(self, examples):
    max_length = self.max_length
    pad_token = self.pad_token
    pad_token_segment_id = self.pad_token_segment_id
    mask_padding_with_zero = self.mask_padding_with_zero
    label_map = self.get_label_map()
    features = []
    for (ex_index, example) in enumerate(examples):
        inputs = self.tokenizer.encode_plus(example.text_a,
                                            example.text_b,
                                            add_special_tokens=True,
                                            max_length=max_length)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if self.pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        label = label_map[example.label]
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))
    return features

def convert_examples_to_features(examples, tokenizer, label_map, max_length=512,
                                 pad_on_left=False, pad_token=0,
                                 pad_token_segment_id=0, mask_padding_with_zero=True):
    features = []
    len_examples = len(examples)
    for (ex_index, example) in enumerate(examples):
        if ex_index % 100 == 0:
            logging.info("converting example %d/%d" % (ex_index, len_examples))
        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_length)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([0 if mask_padding_with_zero else 1] * 0 or [pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        label = label_map[example.label]
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))
    return features

def __call__(self, texts_a: List[str], texts_b: Optional[List[str]] = None) -> Union[
        List[InputFeatures], Tuple[List[InputFeatures], List[List[str]]]]:
    """Tokenize and create masks.

    texts_a and texts_b are separated by [SEP] token

    Args:
        texts_a: list of texts
        texts_b: list of texts, it could be None, e.g. single sentence classification task

    Returns:
        batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \
            subtoken mask, segment mask, or tuple of batch of InputFeatures and batch of subtokens
    """
    if texts_b is None:
        texts_b = [None] * len(texts_a)

    input_features = []
    tokens = []
    for text_a, text_b in zip(texts_a, texts_b):
        encoded_dict = self.tokenizer.encode_plus(
            text=text_a,
            text_pair=text_b,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        if 'token_type_ids' not in encoded_dict:
            if self.add_token_type_ids:
                input_ids = encoded_dict['input_ids']
                seq_len = input_ids.size(1)
                sep = torch.where(input_ids == self.tokenizer.sep_token_id)[1][0].item()
                len_a = min(sep + 1, seq_len)
                len_b = seq_len - len_a
                encoded_dict['token_type_ids'] = torch.cat((torch.zeros(1, len_a, dtype=int),
                                                            torch.ones(1, len_b, dtype=int)), dim=1)
            else:
                encoded_dict['token_type_ids'] = torch.tensor([0])

        curr_features = InputFeatures(input_ids=encoded_dict['input_ids'],
                                      attention_mask=encoded_dict['attention_mask'],
                                      token_type_ids=encoded_dict['token_type_ids'],
                                      label=None)
        input_features.append(curr_features)
        if self.return_tokens:
            tokens.append(self.tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0]))

    if self.return_tokens:
        return input_features, tokens
    else:
        return input_features

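# A minimal, self-contained sketch of the token_type_ids reconstruction above:
# segment 0 covers everything up to and including the first [SEP]; segment 1
# covers the rest. The toy ids below are illustrative stand-ins for real
# tokenizer output, not values from the original code.
import torch

def rebuild_token_type_ids(input_ids: torch.Tensor, sep_token_id: int) -> torch.Tensor:
    seq_len = input_ids.size(1)
    sep = torch.where(input_ids == sep_token_id)[1][0].item()  # first [SEP] position
    len_a = min(sep + 1, seq_len)
    return torch.cat((torch.zeros(1, len_a, dtype=torch.long),
                      torch.ones(1, seq_len - len_a, dtype=torch.long)), dim=1)

ids = torch.tensor([[101, 7592, 102, 2088, 102]])  # [CLS] hello [SEP] world [SEP]
assert rebuild_token_type_ids(ids, sep_token_id=102).tolist() == [[0, 0, 0, 1, 1]]
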
def convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len
    processor = THUCNewsProcessor()
    if label_list is None:
        label_list = processor.get_labels()
        logger.info("Using label list %s for task %s" % (label_list, task))
    if output_mode is None:
        output_mode = "classification"
        logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    labels = [label_from_example(example) for example in examples]

    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        # https://github.com/huggingface/transformers/blob/master/src/transformers/data/processors/utils.py#L56
        # InputFeatures holds four fields: input_ids, attention_mask, token_type_ids and label
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    for i, example in enumerate(examples[:5]):
        logger.info("*** Example ***")
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features

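# A hedged usage sketch for the batch-encoding pattern above (assumes
# transformers >= 3 is installed; the model name and toy texts are
# illustrative, not from the original code):
from transformers import BertTokenizer
from transformers.data.processors.utils import InputFeatures

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
texts = ["a sample news headline", "another sample news headline"]
batch_encoding = tokenizer(texts, max_length=32, padding="max_length", truncation=True)
features = [InputFeatures(**{k: batch_encoding[k][i] for k in batch_encoding}, label=0)
            for i in range(len(texts))]
print(len(features[0].input_ids))  # 32: every example is padded to max_length
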
def __getitem__(self, index):
    if self.cache_mode == 'pickle':
        return self.instances[index]
    elif self.cache_mode == 'memmap':
        instance_data = {}
        for k in self.mem_maps:
            if k != "labels":
                instance_data[k] = list(self.mem_maps[k][index])
        inputs = {k: instance_data[k] for k in instance_data}
        feature = InputFeatures(**inputs, label=float(self.mem_maps["labels"][index]))
        return feature

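# A self-contained sketch of the memmap read pattern above (file name and
# shapes are illustrative): rows hold fixed-length encoded examples, so one
# example can be sliced out without loading the whole file into memory.
import numpy as np

ids = np.memmap("input_ids.dat", dtype=np.int32, mode="w+", shape=(4, 8))
ids[:] = 7  # stand-in for real token ids
ids.flush()

read_back = np.memmap("input_ids.dat", dtype=np.int32, mode="r", shape=(4, 8))
example_ids = list(read_back[2])  # one row -> one example's input_ids
assert example_ids == [7] * 8
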
def _cache_instances_pickle(self):
    """
    Loads tensors into memory or creates the dataset when it does not exist already.
    """
    signature = "weakly_supervised_pointwise_set_{}_n_cand_docs_{}_ns_sampler_{}_seq_max_l_{}_sample_{}_for_{}_using_{}".\
        format(self.data_partition,
               self.negative_sampler.num_candidates_samples,
               self.negative_sampler.name,
               self.max_seq_len,
               self.sample_data,
               self.task_type,
               self.tokenizer.__class__.__name__)
    path = self.cache_path + "/" + signature

    if os.path.exists(path):
        with open(path, 'rb') as f:
            logging.info("Loading instances from {}".format(path))
            self.instances = pickle.load(f)
    else:
        logging.info("Generating instances with signature {}".format(signature))

        labels = []
        examples = []
        for idx, row in enumerate(tqdm(self.data.itertuples(index=False), total=len(self.data))):
            query = row[0]
            relevant_documents = row[1]
            for relevant_document in relevant_documents:
                examples.append((query, relevant_document))
                labels.append(1.0)
            ns_candidates, ns_scores, _, _, _ = self.negative_sampler.sample(query, relevant_documents)
            for i, ns in enumerate(ns_candidates):
                examples.append((query, ns))
                labels.append(ns_scores[i])

        logging.info("Encoding examples using tokenizer.batch_encode_plus().")
        batch_encoding = self.tokenizer(examples, max_length=self.max_seq_len,
                                        padding="max_length", truncation=True)

        logging.info("Transforming examples to instances format.")
        self.instances = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}
            feature = InputFeatures(**inputs, label=labels[i])
            self.instances.append(feature)

        for idx in range(3):
            logging.info("Set {} Instance {} query \n\n{}[...]\n".format(self.data_partition, idx, examples[idx][0][0:200]))
            logging.info("Set {} Instance {} document \n\n{}\n".format(self.data_partition, idx, examples[idx][1][0:200]))
            logging.info("Set {} Instance {} features \n\n{}\n".format(self.data_partition, idx, self.instances[idx]))

        with open(path, 'wb') as f:
            pickle.dump(self.instances, f)
    logging.info("Total of {} instances were cached.".format(len(self.instances)))

def roberta_convert_examples_to_tf_dataset(examples, tokenizer, tagset, max_length):
    features = []  # -> will hold InputFeatures to be converted later
    label_map = {label: i for i, label in enumerate(tagset)}  # Tags to indexes
    for e in examples:
        tokens = e["tokens"]
        labels = e["tags"]
        # Tokenize subwords and propagate labels
        split_tokens, split_labels, idx_map = tokenizer.subword_tokenize(tokens, labels)
        # Create features
        input_ids = tokenizer.convert_tokens_to_ids(split_tokens)
        attention_mask = [1] * len(input_ids)
        label_ids = [label_map[label] for label in split_labels]
        padding = [0] * (max_length - len(input_ids))
        input_ids += padding
        attention_mask += padding
        label_ids += padding
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          label=label_ids))

    def gen():
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
            },
            tf.TensorShape([None]),
        ),
    )

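# A minimal, self-contained sketch of the from_generator pattern used above
# (toy feature values; real ones come from the tokenizer):
import tensorflow as tf

def gen():
    yield ({"input_ids": [101, 7592, 102, 0], "attention_mask": [1, 1, 1, 0]},
           [0, 3, 0, 0])

dataset = tf.data.Dataset.from_generator(
    gen,
    ({"input_ids": tf.int32, "attention_mask": tf.int32}, tf.int64),
    ({"input_ids": tf.TensorShape([None]), "attention_mask": tf.TensorShape([None])},
     tf.TensorShape([None])),
)
for x, y in dataset:
    print(x["input_ids"].shape, y.shape)  # (4,) (4,)
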
def _glue_convert_examples_to_features(
    examples: List[InputExample],
    tokenizer: PreTrainedTokenizer,
    max_length: Optional[int] = None,
    task=None,
    label_list=None,
    output_mode=None,
):
    if max_length is None:
        max_length = tokenizer.max_len
    if task is not None:
        processor = glue_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = glue_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    # Map label strings to ids
    label_map = {label: i for i, label in enumerate(label_list)}

    def label_from_example(example: InputExample) -> Union[int, float, None]:
        if example.label is None:
            return None
        if output_mode == "classification":
            return label_map[example.label]
        elif output_mode == "regression":
            return float(example.label)
        raise KeyError(output_mode)

    # Collect the labels of all examples
    labels = [label_from_example(example) for example in examples]

    # Encode all examples to ids, padded or truncated to max_length
    batch_encoding = tokenizer(
        [(example.text_a, example.text_b) for example in examples],
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

    # Pack input_ids, attention_mask, token_type_ids and label into one InputFeatures object
    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=labels[i])
        features.append(feature)

    # Log the first 5 examples
    logger.info("*** First 5 examples ***")
    for i, example in enumerate(examples[:5]):
        logger.info("guid: %s" % (example.guid))
        logger.info("features: %s" % features[i])

    return features

def bert_attribute_accuracy(targets, predictions, classifier_model, tokenizer,
                            device, attributes_origin=None, batch_size=32):
    batch_encoding = tokenizer.batch_encode_plus(predictions,
                                                 max_length=tokenizer.max_len,
                                                 pad_to_max_length=True)
    features = []
    for i in range(len(predictions)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs)
        features.append(feature)

    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)

    # Data on TPU
    all_input_ids = all_input_ids.to(device)
    all_attention_mask = all_attention_mask.to(device)
    all_token_type_ids = all_token_type_ids.to(device)

    classifier_model.eval()
    with torch.no_grad():
        inputs = {
            "input_ids": all_input_ids,
            "attention_mask": all_attention_mask,
            "token_type_ids": all_token_type_ids
        }
        prediction_labels = torch.round(
            torch.sigmoid(classifier_model(**inputs)[0].squeeze(1)))
    prediction_labels = prediction_labels.detach().cpu().numpy()
    attributes_origin = np.array(attributes_origin)
    # prediction_labels is a numpy array at this point, so cast with astype
    # rather than the torch-only .float(); an example counts as correct when
    # the predicted attribute differs from the original one.
    correct = (prediction_labels != attributes_origin).astype(float)
    attribute_accuracy = correct.sum() / len(correct)
    return {"attribute_accuracy": attribute_accuracy}

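# Minimal sketch of the logit -> binary label step used above (toy logits,
# illustrative only): sigmoid maps logits to probabilities, round thresholds at 0.5.
import torch

logits = torch.tensor([-2.0, 0.3, 1.5])
labels = torch.round(torch.sigmoid(logits))
assert labels.tolist() == [0.0, 1.0, 1.0]
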
def __call__(self, batch: List[List[str]]) -> List[List[InputFeatures]]:
    """Tokenize and create masks.

    Args:
        batch: list of elements where the first element represents the batch with contexts
            and the rest of elements represent response candidates batches

    Returns:
        list of feature batches with subtokens, subtoken ids, subtoken mask, segment mask.
    """
    if isinstance(batch[0], str):
        batch = [batch]

    cont_resp_pairs = []
    if len(batch[0]) == 1:
        contexts = batch[0]
        responses_empt = [None] * len(batch)
        cont_resp_pairs.append(zip(contexts, responses_empt))
    else:
        contexts = [el[0] for el in batch]
        for i in range(1, len(batch[0])):
            responses = []
            for el in batch:
                responses.append(el[i])
            cont_resp_pairs.append(zip(contexts, responses))

    input_features = []
    for s in cont_resp_pairs:
        sub_list_features = []
        for context, response in s:
            encoded_dict = self.tokenizer.encode_plus(
                text=context,
                text_pair=response,
                add_special_tokens=True,
                max_length=self.max_seq_length,
                pad_to_max_length=True,
                return_attention_mask=True,
                return_tensors='pt')
            curr_features = InputFeatures(
                input_ids=encoded_dict['input_ids'],
                attention_mask=encoded_dict['attention_mask'],
                token_type_ids=encoded_dict['token_type_ids'],
                label=None)
            sub_list_features.append(curr_features)
        input_features.append(sub_list_features)

    return input_features

def __getitem__(self, item):
    tweet = self.tweets[item]
    tokens = self.tokenizer.encode_plus(
        tweet,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )
    return InputFeatures(input_ids=tokens['input_ids'].flatten().long().numpy().tolist(),
                         attention_mask=tokens['attention_mask'].flatten().long().numpy().tolist())

def encode_sentences(df, with_context, with_section_names):
    """
    Encodes a list of sentences into BERT tokens and returns a list of feature-label combinations.
    InputFeatures contains input_ids, attention_masks, token_type_ids and the label.

    :param df: a data frame of features and labels.
    :param with_context: True, if the context (pre and post sentence) should be considered.
    :param with_section_names: True, if the section name should be considered.
    :return: a list of InputFeatures where each element is a feature-label combination.
    """
    features = []
    for entry in df.iterrows():
        sentence = get_context(entry) if with_context else entry[1]['sentence']
        if with_section_names:
            sentence = entry[1]['section_name'] + " " + sentence
        inputs = encode_sentence(sentence)
        features.append(InputFeatures(**inputs, label=int(entry[1]['used'])))
    return features

def convert_single_example_to_features(example, tokenizer, max_length=512, pad_token=0,
                                       pad_token_segment_id=0, mask_padding_with_zero=True):
    inputs = tokenizer.encode_plus(
        example.text_a,
        example.text_b,
        add_special_tokens=True,
        max_length=max_length,
        truncate_first_sequence=True  # We're truncating the first sequence in priority
    )
    input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
    attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

    # Zero-pad up to the sequence length.
    padding_length = max_length - len(input_ids)
    input_ids = input_ids + ([pad_token] * padding_length)
    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
    token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

    assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
    assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
    assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

    # logger.info("*** Example ***")
    # logger.info("guid: %s" % (example.guid))
    # logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
    # logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
    # logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))

    feature = InputFeatures(input_ids=input_ids,
                            attention_mask=attention_mask,
                            token_type_ids=token_type_ids,
                            label=None)
    return feature

def _input_fn(texts, labels):
    features = []
    for text, label in zip(texts, labels):
        # The docs recommend add_prefix_space for this tokenizer
        inputs = TOKENIZER.encode_plus(text,
                                       None,
                                       add_special_tokens=True,
                                       max_length=DEFAULT_MAX_LEN,
                                       add_prefix_space=True)
        input_ids = inputs["input_ids"]
        attention_mask = [1] * len(input_ids)
        input_ids = _pad_with(input_ids, PAD_TOKEN, DEFAULT_MAX_LEN)
        attention_mask = _pad_with(attention_mask, PAD_TOKEN, DEFAULT_MAX_LEN)
        assert len(input_ids) == DEFAULT_MAX_LEN
        assert len(attention_mask) == DEFAULT_MAX_LEN
        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          label=label))

    def gen():
        for f in features:
            yield ({'input_ids': f.input_ids,
                    'attention_mask': f.attention_mask},
                   f.label)

    return tf.data.Dataset.from_generator(
        gen,
        ({'input_ids': tf.int32, 'attention_mask': tf.int32}, tf.float32),
        ({'input_ids': tf.TensorShape([None]),
          'attention_mask': tf.TensorShape([None])},
         tf.TensorShape([])))

def __call__(self, texts_a: List[str], texts_b: Optional[List[str]] = None) -> Union[
        List[InputFeatures], Tuple[List[InputFeatures], List[List[str]]]]:
    """Tokenize and create masks.

    texts_a and texts_b are separated by [SEP] token

    Args:
        texts_a: list of texts
        texts_b: list of texts, it could be None, e.g. single sentence classification task

    Returns:
        batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \
            subtoken mask, segment mask, or tuple of batch of InputFeatures and batch of subtokens
    """
    if texts_b is None:
        texts_b = [None] * len(texts_a)

    input_features = []
    tokens = []
    for text_a, text_b in zip(texts_a, texts_b):
        encoded_dict = self.tokenizer.encode_plus(
            text=text_a,
            text_pair=text_b,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt')
        curr_features = InputFeatures(input_ids=encoded_dict['input_ids'],
                                      attention_mask=encoded_dict['attention_mask'],
                                      token_type_ids=encoded_dict['token_type_ids'],
                                      label=None)
        input_features.append(curr_features)
        if self.return_tokens:
            tokens.append(self.tokenizer.convert_ids_to_tokens(encoded_dict['input_ids'][0]))

    if self.return_tokens:
        return input_features, tokens
    else:
        return input_features

def return_answers(self, question, search_name=None, min_score=None, max_length=128):
    """
    Searches texts for sentences that answer a question.

    Texts from the results of a specified search (or the whole corpus if no
    `search_name` is given) are split into sentences, and each sentence is scored
    based on the likelihood it answers the `question` parameter provided - the
    higher the score, the more likely the sentence contains an answer to the given
    question. Results are returned as tuples in the following format:
    (text id, sentence number, sentence text, score)

    Parameters
    ----------
    question: str
        Question against which sentences are scored
    search_name: str, optional
        Name of the search to take results from. If none, the whole text corpus
        in the `texts` attribute is used
    min_score: int, optional
        The minimum score a sentence must receive to be returned in the output
    max_length: int, default 128
        The length of a sentence in tokens used by the Bert model to set the
        fixed-length input to the model

    Returns
    -------
    tuple: (str: text_id, str: sentence no, str: sentence text, float: score)
    """
    if search_name is not None:
        search_texts_ids = self.search_results[search_name].ids
    else:
        search_texts_ids = self.texts.keys()

    print('=' * 100)
    print(f"Checking {len(search_texts_ids)} search results "
          f"for answers to {question}")

    # collect texts that correspond with ids from search
    # and create (sentence, text_id) tuples
    search_texts = [self.texts[text_id] for text_id in search_texts_ids]
    sentence_tuples = self._split_text_to_sentences(search_texts_ids, search_texts)

    # create input examples with question
    # and sentence (potential answer) pairs
    input_examples = []
    for sentence_tuple in sentence_tuples:
        text_id, sentence_no, sentence = sentence_tuple
        input_example = InputExample(
            guid=str(text_id) + '_' + str(sentence_no),
            text_a=question,
            text_b=sentence
        )
        input_examples.append(input_example)
    print("Inputs converted to BERT InputExamples")

    # take input examples and convert to input features with padding
    input_features = []
    for idx, example in enumerate(input_examples):
        inputs = self.tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length
        )
        input_ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        attention_mask = [1] * len(input_ids)
        padding_length = max_length - len(input_ids)
        pad_token = self.tokenizer.convert_tokens_to_ids([self.tokenizer.pad_token])[0]
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0] * padding_length)
        token_type_ids = token_type_ids + ([0] * padding_length)
        input_features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=None)
        )
    print("InputExamples converted to InputFeatures")

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in input_features], dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in input_features], dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in input_features], dtype=torch.long)
    tensor_dataset = TensorDataset(all_input_ids, all_attention_mask, all_token_type_ids)
    print("InputFeatures converted to TensorDataset")

    # create dataloader to feed batches to torch model
    sampler = SequentialSampler(tensor_dataset)
    dataloader = DataLoader(tensor_dataset, sampler=sampler, batch_size=100)
    print("TensorDataset converted to torch DataLoader")

    print(f"Ranking {len(sentence_tuples)} possible answers from "
          f"{len(search_texts)} texts:", flush=True)

    # feed data to model and output logits
    # i.e. [likelihood not answer, likelihood answer]
    all_logits = []
    with torch.no_grad():
        for batch in tqdm(dataloader, total=len(dataloader)):
            model_input = tuple(tensor.to(self.device) for tensor in batch)
            inputs = {'input_ids': model_input[0],
                      'attention_mask': model_input[1]}
            batch_logits = self.model(**inputs)[0]
            if len(all_logits):
                all_logits = np.concatenate([all_logits, batch_logits.cpu()])
            else:
                all_logits = np.array(batch_logits.cpu())

    answer_score = all_logits[:, 1] - all_logits[:, 0]
    ranked_answers = answer_score.argsort()[::-1]

    answer_tuples = []
    for answer_idx in ranked_answers:
        if min_score is not None:
            if answer_score[answer_idx] < min_score:
                break
        text_id, sentence_no, sentence = sentence_tuples[answer_idx]
        answer_tuples.append((text_id, sentence_no, sentence,
                              answer_score[answer_idx]))
    return answer_tuples

def run_test(args):
    data = pd.read_csv(args.test_path, sep='\t')
    question_bank = pd.read_csv("%s/question_bank.tsv" % args.data_dir, sep="\t")
    all_documents = list(question_bank["question"].values[1:])

    examples = []
    for tid in data['topic_id'].unique():
        query = data.loc[data['topic_id'] == tid, 'initial_request'].tolist()[0]
        for doc in all_documents:
            examples.append((query, doc))

    tokenizer = BertTokenizer.from_pretrained("%s/vocab.txt" % args.log_dir)
    batch_encoding = tokenizer.batch_encode_plus(
        examples, max_length=args.max_seq_len, truncation=True, pad_to_max_length=True)
    features = []
    for i in range(len(examples)):
        inputs = {k: batch_encoding[k][i] for k in batch_encoding}
        feature = InputFeatures(**inputs, label=0)
        features.append(feature)

    dataset = SimpleDataset(features)
    data_collator = DefaultDataCollator()
    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            collate_fn=data_collator.collate_batch)

    # load fine-tuned model
    model = BertForSequenceClassification.from_pretrained(args.log_dir)
    ranker = transformer_ranker.TransformerRanker(
        model=model, train_loader=None, val_loader=None, test_loader=None,
        num_ns_eval=None, task_type="classification", tokenizer=tokenizer,
        validate_every_epochs=1, num_validation_instances=-1,
        num_epochs=args.num_epochs, lr=args.lr, sacred_ex=None)
    _, _, softmax_output = ranker.predict(dataloader)
    softmax_output_by_query = utils.acumulate_list(softmax_output[0], len(all_documents))

    # save output
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    if "dev" in args.test_path:
        run_file_path = "%s/dev_ranked_q.txt" % args.output_dir
    else:
        run_file_path = "%s/test_ranked_q.txt" % args.output_dir

    all_doc_ids = np.array(question_bank["question_id"].values[1:])
    with open(run_file_path, 'w') as fo:
        for tid_idx, tid in enumerate(data['topic_id'].unique()):
            all_documents_scores = np.array(softmax_output_by_query[tid_idx])
            print("tid:", tid)
            top_30_scores_idx = (-all_documents_scores).argsort()[:30]
            preds_score = list(all_documents_scores[top_30_scores_idx])
            preds = list(all_doc_ids[top_30_scores_idx])
            # print("softmax_score:", preds_score)
            # print("preds:", preds)
            # query = data.loc[data['topic_id']==tid, 'initial_request'].tolist()[0]
            # best_q = get_best_q(query, question_bank)
            # best_qid = random.choice([best_q, "Q00001"])
            if preds_score[0] < 0.962:
                best_qid = "Q00001"
                preds = preds[:-1]
                preds.insert(0, best_qid)
            else:
                last_qid = "Q00001"
                preds = preds[:-1]
                preds.append(last_qid)
            for i, qid in enumerate(preds):
                fo.write('{} 0 {} {} {} BERT-based-v2\n'.format(tid, qid, i, len(preds) - i))
    print("saved results to [%s]" % run_file_path)

def xdoc_convert_examples_to_features(processor, examples, tokenizer, max_length,
                                      label_list, pad_token=0, pad_token_segment_id=0,
                                      mask_padding_with_zero=True):
    if label_list is None:
        label_list = processor.get_labels()
    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for ex_index, example in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % ex_index)
        inputs = tokenizer.encode_plus(example.text_a,
                                       example.text_b,
                                       add_special_tokens=True,
                                       max_length=max_length)
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        label = label_map[example.label]

        if ex_index < 3:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))
    return features

def convert_examples_to_features(examples, tokenizer,
                                 max_length=512,
                                 label_list=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True,
                                 sample_negatives=False):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method
        pad_on_left: If set to ``True``, the examples will be padded on the left rather
            than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0,
            but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by
            ``1`` for actual values and by ``0`` for padded values. If set to ``False``,
            inverts it (``1`` for padded values, ``0`` for actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``,
        will return a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    if examples[0].text_b is not None:
        k = len(examples[0].text_b)
        if sample_negatives:
            neg_indices = [np.random.choice(len(examples), size=len(examples), replace=False)
                           for i in range(k)]

    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)

        if type(example.text_a) is list:
            text_a = example.text_a
            text_b = [example.text_b] * len(text_a)
        elif type(example.text_b) is list:
            text_b = example.text_b
            if sample_negatives:
                label_idx = label_map[example.label]
                text_b_neg = [(examples[neg_indices[i][ex_index]]).text_b[label_idx]
                              for i in range(k)]
                text_b_neg[label_idx] = text_b[label_idx]
            text_a = [example.text_a] * len(text_b)
        else:
            text_a = [example.text_a]
            text_b = [example.text_b]

        if 0:  # sample_negatives: disabled debug output
            print('Created negative samples')
            print('Original example: label:{} text_a: {} text_b1: {}, 2: {}, 3:{}'.format(
                example.label, text_a[0], text_b[0], text_b[1], text_b[2]))
            print('Converted example: text_a: {} text_b1: {}, 2: {}, 3:{}'.format(
                text_a[0], text_b_neg[0], text_b_neg[1], text_b_neg[2]))

        def get_indices(t1, t2):
            out = []
            for a, b in zip(t1, t2):
                inputs = tokenizer.encode_plus(
                    a,
                    b,
                    add_special_tokens=True,
                    max_length=max_length,
                )
                input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

                # The mask has 1 for real tokens and 0 for padding tokens. Only real
                # tokens are attended to.
                attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

                # Zero-pad up to the sequence length.
                padding_length = max_length - len(input_ids)
                if pad_on_left:
                    input_ids = ([pad_token] * padding_length) + input_ids
                    attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
                    token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
                else:
                    input_ids = input_ids + ([pad_token] * padding_length)
                    attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
                    token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

                assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
                assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
                assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)
                out.append((input_ids, attention_mask, token_type_ids))

            if len(t1) == 1:
                input_ids, attention_mask, token_type_ids = out[0]
            else:
                input_ids, attention_mask, token_type_ids = zip(*out)
            return input_ids, attention_mask, token_type_ids

        input_ids, attention_mask, token_type_ids = get_indices(text_a, text_b)
        if sample_negatives:
            input_ids_n, attention_mask_n, token_type_ids_n = get_indices(text_a, text_b_neg)

        label = label_map[example.label]

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))
        if sample_negatives:
            features.append(
                InputFeatures(input_ids=input_ids_n,
                              attention_mask=attention_mask_n,
                              token_type_ids=token_type_ids_n,
                              label=label))

    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield ({'input_ids': ex.input_ids,
                        'attention_mask': ex.attention_mask,
                        'token_type_ids': ex.token_type_ids},
                       ex.label)

        return tf.data.Dataset.from_generator(
            gen,
            ({'input_ids': tf.int32, 'attention_mask': tf.int32, 'token_type_ids': tf.int32},
             tf.int64),
            ({'input_ids': tf.TensorShape([None]),
              'attention_mask': tf.TensorShape([None]),
              'token_type_ids': tf.TensorShape([None])},
             tf.TensorShape([])))

    return features

def convert_examples_to_features(examples, tokenizer,
                                 max_length=512,
                                 task=None,
                                 label_list=None,
                                 output_mode=None,
                                 pad_on_left=False,
                                 pad_token=0,
                                 pad_token_segment_id=0,
                                 mask_padding_with_zero=True):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: GLUE task
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather
            than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0,
            but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by
            ``1`` for actual values and by ``0`` for padded values. If set to ``False``,
            inverts it (``1`` for padded values, ``0`` for actual values)

    Returns:
        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
        containing the task-specific features. If the input is a list of ``InputExamples``,
        will return a list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    is_tf_dataset = False
    if is_tf_available() and isinstance(examples, tf.data.Dataset):
        is_tf_dataset = True

    if task is not None:
        processor = processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        else:
            try:
                processor.set_labels(label_list)
            except:
                pass
        if output_mode is None:
            output_mode = output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for (ex_index, example) in enumerate(examples):
        if ex_index % 10000 == 0:
            logger.info("Writing example %d" % (ex_index))
        if is_tf_dataset:
            example = processor.get_example_from_tensor_dict(example)
            example = processor.tfds_map(example)

        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        features.append(
            InputFeatures(input_ids=input_ids,
                          attention_mask=attention_mask,
                          token_type_ids=token_type_ids,
                          label=label))

    if is_tf_available() and is_tf_dataset:
        def gen():
            for ex in features:
                yield ({'input_ids': ex.input_ids,
                        'attention_mask': ex.attention_mask,
                        'token_type_ids': ex.token_type_ids},
                       ex.label)

        return tf.data.Dataset.from_generator(
            gen,
            ({'input_ids': tf.int32, 'attention_mask': tf.int32, 'token_type_ids': tf.int32},
             tf.int64),
            ({'input_ids': tf.TensorShape([None]),
              'attention_mask': tf.TensorShape([None]),
              'token_type_ids': tf.TensorShape([None])},
             tf.TensorShape([])))

    return features

def convert_examples_to_features(cls, examples, tokenizer, max_length,
                                 pad_token_segment_id, pad_token,
                                 mask_padding_with_zero=True):
    features = []
    for ex_index, example in enumerate(examples):
        inputs = tokenizer.encode_plus(
            example.text_a,
            None,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
        padding_length = max_length - len(input_ids)
        input_ids = input_ids + ([pad_token] * padding_length)
        attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
        token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if ex_index < 3:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=None,
            ))
    return features

def convert_single_example(ex_index, example, label_list, max_seq_length, tokenizer):
    label_map = {}
    for (i, label) in enumerate(label_list):
        label_map[label] = i

    tokens_a = tokenizer.tokenize(example.text_a)
    tokens_b = None
    if example.text_b:
        tokens_b = tokenizer.tokenize(example.text_b)

    if tokens_b:
        # Modifies `tokens_a` and `tokens_b` in place so that the total
        # length is less than the specified length.
        # Account for [CLS], [SEP], [SEP] with "- 3"
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Account for [CLS] and [SEP] with "- 2"
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    # The convention in BERT is:
    # (a) For sequence pairs:
    #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
    #  type_ids: 0     0  0    0    0     0       0 0     1  1  1  1   1 1
    # (b) For single sequences:
    #  tokens:   [CLS] the dog is hairy . [SEP]
    #  type_ids: 0     0   0   0  0     0 0
    #
    # Where "type_ids" are used to indicate whether this is the first
    # sequence or the second sequence. The embedding vectors for `type=0` and
    # `type=1` were learned during pre-training and are added to the wordpiece
    # embedding vector (and position vector). This is not *strictly* necessary
    # since the [SEP] token unambiguously separates the sequences, but it makes
    # it easier for the model to learn the concept of sequences.
    #
    # For classification tasks, the first vector (corresponding to [CLS]) is
    # used as the "sentence vector". Note that this only makes sense because
    # the entire model is fine-tuned.
    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    if tokens_b:
        for token in tokens_b:
            tokens.append(token)
            segment_ids.append(1)
        tokens.append("[SEP]")
        segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    label_id = label_map[example.label]
    # Pass the computed mask; the original passed attention_mask=None,
    # leaving `input_mask` unused.
    feature = InputFeatures(input_ids=input_ids,
                            token_type_ids=segment_ids,
                            attention_mask=input_mask,
                            label=label_id)
    return feature

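# A worked toy instance of the [CLS]/[SEP]/segment layout described in the
# comments above (pre-split wordpieces stand in for a real tokenizer):
tokens_a = "is this jack ##son ##ville ?".split()
tokens_b = "no it is not .".split()
tokens = ["[CLS]"] + tokens_a + ["[SEP]"] + tokens_b + ["[SEP]"]
segment_ids = [0] * (len(tokens_a) + 2) + [1] * (len(tokens_b) + 1)
assert len(tokens) == len(segment_ids) == 14
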
def fever_convert_examples_to_features(
    examples,
    tokenizer,
    max_length=512,
    task=None,
    label_list=None,
    output_mode=None,
    pad_on_left=False,
    pad_token=0,
    pad_token_segment_id=0,
    mask_padding_with_zero=True,
):
    """
    Loads a data file into a list of ``InputFeatures``

    Args:
        examples: List of ``InputExamples`` containing the examples.
        tokenizer: Instance of a tokenizer that will tokenize the examples
        max_length: Maximum example length
        task: FEVER task
        label_list: List of labels. Can be obtained from the processor using the
            ``processor.get_labels()`` method
        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
        pad_on_left: If set to ``True``, the examples will be padded on the left rather
            than on the right (default)
        pad_token: Padding token
        pad_token_segment_id: The segment ID for the padding token (It is usually 0,
            but can vary such as for XLNet where it is 4)
        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by
            ``1`` for actual values and by ``0`` for padded values. If set to ``False``,
            inverts it (``1`` for padded values, ``0`` for actual values)

    Returns:
        A list of task-specific ``InputFeatures`` which can be fed to the model.
    """
    if task is not None:
        processor = fever_processors[task]()
        if label_list is None:
            label_list = processor.get_labels()
            logger.info("Using label list %s for task %s" % (label_list, task))
        if output_mode is None:
            output_mode = fever_output_modes[task]
            logger.info("Using output mode %s for task %s" % (output_mode, task))

    for (ex_index, example) in enumerate(examples):
        inputs = tokenizer.encode_plus(
            example.text_a,
            example.text_b,
            add_special_tokens=True,
            max_length=max_length,
        )
        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_length - len(input_ids)
        if pad_on_left:
            input_ids = ([pad_token] * padding_length) + input_ids
            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
        else:
            input_ids = input_ids + ([pad_token] * padding_length)
            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length)
        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length)

        if output_mode == "classification":
            label_map = {label: i for i, label in enumerate(label_list)}
            label = label_map[example.label]
        elif output_mode == "regression":
            label = float(example.label)
        else:
            raise KeyError(output_mode)

        if ex_index < 5:
            logger.info("*** Example ***")
            logger.info("guid: %s" % (example.guid))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label))

        yield InputFeatures(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            label=label,
        )

def __getitem__(self, idx):
    inputs = {k: self.batch_encoding[k][idx] for k in self.batch_encoding}
    return InputFeatures(**inputs, label=self.batch_labels[idx])

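# A hedged sketch of a Dataset that could carry the lazy __getitem__ above:
# all texts are encoded once up front and an InputFeatures object is only
# materialized per index. Class and field names here are illustrative, not
# from the original code; assumes torch and transformers are installed.
from torch.utils.data import Dataset
from transformers import BertTokenizer
from transformers.data.processors.utils import InputFeatures

class EncodedDataset(Dataset):
    def __init__(self, texts, labels, max_length=32):
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.batch_encoding = tokenizer(texts, max_length=max_length,
                                        padding="max_length", truncation=True)
        self.batch_labels = labels

    def __len__(self):
        return len(self.batch_labels)

    def __getitem__(self, idx):
        inputs = {k: self.batch_encoding[k][idx] for k in self.batch_encoding}
        return InputFeatures(**inputs, label=self.batch_labels[idx])
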
def _cache_instances_pickle(self):
    """
    WARNING: This is no longer being used as a cache method; memmap should be preferred.

    Loads tensors into memory or creates the dataset when it does not exist already.
    """
    signature = "pointwise_set_{}_n_cand_docs_{}_ns_sampler_{}_seq_max_l_{}_sample_{}_for_{}_using_{}.pk".\
        format(self.data_partition,
               self.negative_sampler.num_candidates_samples,
               self.negative_sampler.name,
               self.max_seq_len,
               self.sample_data,
               self.task_type,
               self.tokenizer.__class__.__name__)
    path = self.cache_path + "/" + signature

    if os.path.exists(path):
        with open(path, 'rb') as f:
            logging.info("Loading instances from {}".format(path))
            self.instances = pickle.load(f)
    else:
        logging.info("Generating instances with signature {}".format(signature))

        # Creating labels (currently there is support only for binary relevance)
        if self.task_type == "classification":
            relevant_label = 1
            not_relevant_label = 0
        elif self.task_type == "generation":
            relevant_label = "relevant </s>"
            not_relevant_label = "not_relevant </s>"

        labels = []
        for r in self.data.itertuples(index=False):
            labels += ([relevant_label] * len(r[1]))  # relevant documents are grouped at the second column.
            labels += ([not_relevant_label] * (self.negative_sampler.num_candidates_samples))  # each query has N negative samples.

        examples = []
        for idx, row in enumerate(tqdm(self.data.itertuples(index=False), total=len(self.data))):
            query = row[0]
            relevant_documents = row[1]
            for relevant_document in relevant_documents:
                examples.append((query, relevant_document))
            ns_candidates, _, _, _, _ = self.negative_sampler.sample(query, relevant_documents)
            for ns in ns_candidates:
                examples.append((query, ns))

        logging.info("Encoding examples using tokenizer.batch_encode_plus().")
        batch_encoding = self.tokenizer(examples, max_length=self.max_seq_len,
                                        padding="max_length", truncation=True)
        if self.task_type == "generation":
            target_encodings = self.tokenizer(labels, max_length=10,
                                              padding="max_length", truncation=True)
            target_encodings = {
                "target_ids": target_encodings["input_ids"],
                "target_attention_mask": target_encodings["attention_mask"]
            }

        logging.info("Transforming examples to instances format.")
        self.instances = []
        for i in range(len(examples)):
            inputs = {k: batch_encoding[k][i] for k in batch_encoding}
            if self.task_type == "generation":
                targets = {k: target_encodings[k][i] for k in target_encodings}
                feature = {**inputs, **targets}
            elif self.task_type == "classification":
                feature = InputFeatures(**inputs, label=labels[i])
            self.instances.append(feature)

        for idx in range(3):
            logging.info("Set {} Instance {} query \n\n{}[...]\n".format(self.data_partition, idx, examples[idx][0][0:200]))
            logging.info("Set {} Instance {} document \n\n{}\n".format(self.data_partition, idx, examples[idx][1][0:200]))
            logging.info("Set {} Instance {} features \n\n{}\n".format(self.data_partition, idx, self.instances[idx]))

        with open(path, 'wb') as f:
            pickle.dump(self.instances, f)
    logging.info("Total of {} instances were cached.".format(len(self.instances)))