def _encode_fixed_length(text: str, tkn: AutoTokenizer, maxlen: int) -> np.ndarray:
    """Encode ``text`` to a fixed length and stack its ids over its mask.

    Args:
        text: Text to tokenize.
        tkn: Tokenizer exposing ``encode_plus``.
        maxlen: Length every sequence is truncated or padded to.

    Returns:
        ``input_ids`` and ``attention_mask`` concatenated along axis 0
        (ids first, mask second).
    """
    encoded = tkn.encode_plus(
        text,
        add_special_tokens=True,
        truncation=True,
        max_length=maxlen,
        # ``pad_to_max_length=True`` is deprecated in transformers;
        # ``padding='max_length'`` is the supported spelling.
        padding='max_length',
        return_attention_mask=True,
        return_tensors='np',
    )
    return np.concatenate(
        (encoded['input_ids'], encoded['attention_mask']), axis=0)


def extract_tokens(title: str, abstract: str, tkn: AutoTokenizer,
                   maxlen: int = 100) -> np.ndarray:
    """Tokenize a title and an abstract into one stacked integer array.

    Both texts are encoded independently with the same settings
    (special tokens added, truncated and padded to ``maxlen``).

    Args:
        title: Title text.
        abstract: Abstract text.
        tkn: Tokenizer exposing ``encode_plus``.
        maxlen: Fixed sequence length for each text. Defaults to 100, the
            value that used to be hard-coded.

    Returns:
        The row-wise concatenation of, in order: title input ids, title
        attention mask, abstract input ids, abstract attention mask.
        With a tokenizer that returns ``(1, maxlen)`` arrays for a single
        text this is a ``(4, maxlen)`` array.
    """
    title_rows = _encode_fixed_length(title, tkn, maxlen)
    abstract_rows = _encode_fixed_length(abstract, tkn, maxlen)
    return np.concatenate((title_rows, abstract_rows))
def create_dataset(df: pd.DataFrame, max_len: int, tokenizer: AutoTokenizer,
                   batch_size: int, shuffle=False) -> tf.data.Dataset:
    """Tokenize a labelled text frame and wrap it in a batched TF dataset.

    Args:
        df: Frame with a ``'text'`` column (input text) and a ``'partisan'``
            column (label).
        max_len: Length every sequence is truncated or padded to.
        tokenizer: Tokenizer exposing ``encode_plus``.
        batch_size: Number of samples per batch. Incomplete final batches
            are dropped (``drop_remainder=True``).
        shuffle: When true, shuffle with a 1024-element buffer, reshuffled
            on every iteration.

    Returns:
        Dataset of ``({'input_ids', 'attention_mask'}, label)`` batches with
        prefetching enabled.
    """
    total_samples = df.shape[0]

    input_ids, input_masks, labels = [], [], []

    # Walk the two needed columns directly; no per-row Series or index needed.
    samples = zip(df['text'], df['partisan'])
    for text, partisan in tqdm(samples, total=total_samples):
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length',
        )
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])
        # Accept 'true', 'True' and boolean True alike; a plain
        # ``== 'true'`` comparison silently labels boolean columns as 0.
        is_partisan = str(partisan).strip().lower() == 'true'
        labels.append(1 if is_partisan else 0)

    features = {
        'input_ids': tf.constant(input_ids),
        'attention_mask': tf.constant(input_masks),
    }
    dataset = tf.data.Dataset.from_tensor_slices(
        (features, tf.constant(labels)))

    if shuffle:
        dataset = dataset.shuffle(1024, reshuffle_each_iteration=True)

    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)
    return dataset
class TorchTransformersMLMPreprocessor(Component):
    """Encode texts for a masked language model and locate the mask token.

    Args:
        vocab_file: path to a vocabulary file, or a name/path accepted by
            ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including special
            tokens
        return_tokens: whether to also return the subtokens of every text

    Attributes:
        max_seq_length: max sequence length in subtokens
        return_tokens: whether to also return the subtokens of every text
        tokenizer: tokenizer used to encode the texts
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 return_tokens: bool = False,
                 **kwargs):
        self.max_seq_length = max_seq_length
        self.return_tokens = return_tokens
        if Path(vocab_file).is_file():
            # ``AutoTokenizer`` cannot be constructed directly (only through
            # ``from_pretrained``), so a plain vocabulary file is loaded with
            # the BERT tokenizer.
            # NOTE(review): assumes the file is a BERT WordPiece vocabulary.
            from transformers import BertTokenizer

            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

    def __call__(self, texts_a: List[str]):
        """Encode every text and record the position of its mask token.

        Args:
            texts_a: list of texts, each expected to contain the
                tokenizer's mask token

        Returns:
            ``(input_features, mask_idxs)`` or, when ``return_tokens`` is
            set, ``(input_features, tokens, mask_idxs)``. ``tokens`` holds
            one subtoken list per text. ``mask_idxs`` holds, per text, the
            index of the last mask token, or 0 when none is present.
        """
        input_features = []
        tokens = []
        mask_idxs = []
        mask_token = self.tokenizer.mask_token

        for text_a in texts_a:
            encoded_dict = self.tokenizer.encode_plus(
                text=text_a,
                add_special_tokens=True,
                max_length=self.max_seq_length,
                truncation=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')

            input_features.append(InputFeatures(
                input_ids=encoded_dict['input_ids'],
                attention_mask=encoded_dict['attention_mask'],
                # Not every tokenizer emits token type ids.
                token_type_ids=encoded_dict.get('token_type_ids'),
                label=None))

            # Kept in its own variable: re-using ``tokens`` here would
            # replace the per-batch accumulator with one text's subtokens.
            text_tokens = self.tokenizer.convert_ids_to_tokens(
                encoded_dict['input_ids'][0])
            if self.return_tokens:
                tokens.append(text_tokens)

            mask_idx = 0
            for position, token in enumerate(text_tokens):
                if token == mask_token:
                    mask_idx = position
            mask_idxs.append(mask_idx)

        if self.return_tokens:
            return input_features, tokens, mask_idxs
        return input_features, mask_idxs
def get_error_samples(trained_model: NLIFineTuningModel,
                      df: pd.DataFrame,
                      tokenizer: transformers.AutoTokenizer,
                      max_length=256):
    """Get samples where model predicts incorrectly

    Args:
        trained_model (NLIFineTuningModel): saved model to make predictions
        df (pd.DataFrame): Dataframe with input text and labels
        tokenizer (transformers.AutoTokenizer): Tokenizer object to encode
            text input
        max_length (int, optional): Maximum permissible length of text to be
            considered. Defaults to 256

    Returns:
        pd.DataFrame or None: One row per misclassified sample with the
        columns ``'Sentence 1'``, ``'Sentence 2'``, ``'Ground Label'``,
        ``'Predicted Label'``, ``'Ground Label Text'`` and
        ``'Predicted Label Text'``. When every sample is predicted
        correctly a message is printed and ``None`` is returned.
    """
    error_samples = []

    # Pure inference: disable autograd so no graph is built per sample.
    # NOTE(review): the model's train/eval mode is left untouched; callers
    # should make sure it is in eval mode.
    with torch.no_grad():
        for _, row in df.iterrows():
            sentence_1 = row[CONFIG['sentence1']]
            sentence_2 = row[CONFIG['sentence2']]
            gold_label = row[CONFIG['labels']]

            encoded_input = tokenizer.encode_plus(
                text=sentence_1,
                text_pair=sentence_2,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=max_length,
                return_token_type_ids=True,
                return_attention_mask=True,
                return_tensors='pt')

            output = trained_model(encoded_input)
            predicted_label = torch.argmax(output.logits).item()

            if predicted_label != gold_label:
                error_samples.append({
                    'Sentence 1': sentence_1,
                    'Sentence 2': sentence_2,
                    'Ground Label': gold_label,
                    'Predicted Label': predicted_label
                })

    if not error_samples:
        print('LOL, No Errors!')
        return None

    reverse_map = {0: 'entailment', 1: 'contradiction', 2: 'neutral'}
    error_df = pd.DataFrame(error_samples)
    error_df['Ground Label Text'] = error_df['Ground Label'].map(reverse_map)
    error_df['Predicted Label Text'] = error_df['Predicted Label'].map(
        reverse_map)
    return error_df
class TorchTransformersMultiplechoicePreprocessor(Component):
    """Tokenize multiple-choice examples and batch them as padded tensors.

    Args:
        vocab_file: path to vocabulary, or a name/path accepted by
            ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP]
            and [CLS] tokens
        return_tokens: stored on the instance; not used by this class

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP]
            and [CLS] tokens
        return_tokens: value passed to the constructor
        tokenizer: tokenizer used to encode the examples
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 return_tokens: bool = False,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.return_tokens = return_tokens
        if Path(vocab_file).is_file():
            # ``AutoTokenizer`` cannot be constructed directly (only through
            # ``from_pretrained``), so a plain vocabulary file is loaded with
            # the BERT tokenizer.
            # NOTE(review): assumes the file is a BERT WordPiece vocabulary.
            from transformers import BertTokenizer

            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

    def tokenize_mc_examples(
            self,
            contexts: List[List[str]],
            choices: List[List[str]]) -> Dict[str, torch.Tensor]:
        """Encode every (context, choice) pair and reshape per example.

        Args:
            contexts: for each example, one context string per choice
            choices: for each example, the candidate choice strings

        Returns:
            Mapping from tokenizer output name to a tensor viewed as
            ``(batch_size, num_choices, -1)``, where ``num_choices`` is
            taken from the first example.
        """
        num_choices = len(contexts[0])
        batch_size = len(contexts)

        # Flatten to one encoding per (context, choice) pair, in example order.
        examples = []
        for context_list, choice_list in zip(contexts, choices):
            for context, choice in zip(context_list, choice_list):
                examples.append(self.tokenizer.encode_plus(
                    text=context,
                    text_pair=choice,
                    return_attention_mask=True,
                    add_special_tokens=True,
                    truncation=True,
                    # Without an explicit limit ``max_seq_length`` was never
                    # applied: truncation fell back to the tokenizer's own
                    # maximum length.
                    max_length=self.max_seq_length))

        # ``padding=True`` pads to the longest sequence in the batch.
        padded_examples = self.tokenizer.pad(
            examples,
            padding=True,
            max_length=self.max_seq_length,
            return_tensors='pt',
        )

        return {
            name: tensor.view(batch_size, num_choices, -1)
            for name, tensor in padded_examples.items()
        }

    def __call__(self,
                 texts_a: List[List[str]],
                 texts_b: Optional[List[List[str]]] = None
                 ) -> Dict[str, torch.Tensor]:
        """Tokenize and create masks.

        texts_a and texts_b are encoded as text / text-pair inputs.

        Args:
            texts_a: for each example, one context string per choice
            texts_b: for each example, the candidate choices; when None,
                every context is encoded on its own

        Returns:
            padded tensors as produced by :meth:`tokenize_mc_examples`
        """
        if texts_b is None:
            # Mirror the shape of texts_a so every context gets an empty pair.
            texts_b = [[None] * len(context_list) for context_list in texts_a]
        return self.tokenize_mc_examples(texts_a, texts_b)
class TorchSquadTransformersPreprocessor(Component):
    """Tokenize text pairs into fixed-length ``InputFeatures``.

    Args:
        vocab_file: path to vocabulary, or a name/path accepted by
            ``AutoTokenizer.from_pretrained``
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP]
            and [CLS] tokens
        return_tokens: whether to return tuple of input features and tokens,
            or only input features
        add_token_type_ids: when the tokenizer does not produce token type
            ids, whether to build them from the position of the first
            separator token

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP]
            and [CLS] tokens
        return_tokens: whether to return tuple of input features and tokens,
            or only input features
        add_token_type_ids: whether missing token type ids are synthesized
        tokenizer: tokenizer used to encode the texts
    """

    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 return_tokens: bool = False,
                 add_token_type_ids: bool = False,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.return_tokens = return_tokens
        self.add_token_type_ids = add_token_type_ids
        if Path(vocab_file).is_file():
            # ``AutoTokenizer`` cannot be constructed directly (only through
            # ``from_pretrained``), so a plain vocabulary file is loaded with
            # the BERT tokenizer.
            # NOTE(review): assumes the file is a BERT WordPiece vocabulary.
            from transformers import BertTokenizer

            vocab_file = str(expand_path(vocab_file))
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

    def _build_token_type_ids(self, input_ids: torch.Tensor) -> torch.Tensor:
        """Return 0 up to and including the first separator, 1 afterwards.

        Falls back to a single all-zero segment when the tokenizer has no
        separator id or the separator does not occur in ``input_ids``.
        """
        seq_len = input_ids.size(1)
        len_a = seq_len
        sep_token_id = self.tokenizer.sep_token_id
        if sep_token_id is not None:
            sep_positions = torch.where(input_ids == sep_token_id)[1]
            if sep_positions.numel() > 0:
                len_a = min(sep_positions[0].item() + 1, seq_len)
        len_b = seq_len - len_a
        return torch.cat((torch.zeros(1, len_a, dtype=torch.long),
                          torch.ones(1, len_b, dtype=torch.long)), dim=1)

    def __call__(
        self,
        texts_a: List[str],
        texts_b: Optional[List[str]] = None
    ) -> Union[List[InputFeatures], Tuple[List[InputFeatures],
                                          List[List[str]]]]:
        """Tokenize and create masks.

        texts_a and texts_b are encoded as text / text-pair inputs.

        Args:
            texts_a: list of texts,
            texts_b: list of texts, it could be None, e.g. single sentence
                classification task

        Returns:
            list of ``InputFeatures`` (input ids, attention mask, token type
            ids), or a tuple of that list and the per-text subtoken lists
            when ``return_tokens`` is set
        """
        if texts_b is None:
            texts_b = [None] * len(texts_a)

        input_features = []
        tokens = []
        for text_a, text_b in zip(texts_a, texts_b):
            encoded_dict = self.tokenizer.encode_plus(
                text=text_a,
                text_pair=text_b,
                add_special_tokens=True,
                max_length=self.max_seq_length,
                truncation=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')

            if 'token_type_ids' not in encoded_dict:
                if self.add_token_type_ids:
                    encoded_dict['token_type_ids'] = \
                        self._build_token_type_ids(encoded_dict['input_ids'])
                else:
                    # Placeholder so the field is always present.
                    encoded_dict['token_type_ids'] = torch.tensor([0])

            input_features.append(InputFeatures(
                input_ids=encoded_dict['input_ids'],
                attention_mask=encoded_dict['attention_mask'],
                token_type_ids=encoded_dict['token_type_ids'],
                label=None))

            if self.return_tokens:
                tokens.append(self.tokenizer.convert_ids_to_tokens(
                    encoded_dict['input_ids'][0]))

        if self.return_tokens:
            return input_features, tokens
        return input_features