Example 1
import numpy as np
from transformers import AutoTokenizer


def extract_tokens(title: str, abstract: str,
                   tkn: AutoTokenizer) -> np.ndarray:
    maxlen = 100
    title_tokens = tkn.encode_plus(
        title,
        add_special_tokens=True,
        truncation=True,
        max_length=maxlen,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='np',
    )
    abstract_tokens = tkn.encode_plus(
        abstract,
        add_special_tokens=True,
        truncation=True,
        max_length=maxlen,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='np',
    )
    # Stack ids and attention mask row-wise: each block has shape (2, maxlen).
    t = np.concatenate(
        (title_tokens['input_ids'], title_tokens['attention_mask']), axis=0)
    a = np.concatenate(
        (abstract_tokens['input_ids'], abstract_tokens['attention_mask']),
        axis=0)
    # Final shape (4, maxlen): title ids, title mask, abstract ids, abstract mask.
    return np.concatenate((t, a))
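
A minimal usage sketch (the bert-base-uncased checkpoint and the sample strings are illustrative assumptions, not part of the original example):

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
features = extract_tokens('A short title', 'An abstract about the paper.', tokenizer)
print(features.shape)  # (4, 100)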
Example 2
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from transformers import AutoTokenizer


def create_dataset(df: pd.DataFrame, max_len: int, tokenizer: AutoTokenizer,
                   batch_size: int, shuffle: bool = False) -> tf.data.Dataset:
    total_samples = df.shape[0]

    # Placeholders input
    input_ids, input_masks = [], []
    
    # Placeholder output
    labels = []

    # Tokenize each row
    for _, row in tqdm(df.iterrows(), total=total_samples):

        # Get text and label
        text = row['text']
        partisan = row['partisan']

        # Encode
        input_encoded = tokenizer.encode_plus(text,
                                              add_special_tokens=True,
                                              max_length=max_len,
                                              truncation=True,
                                              padding='max_length')
        input_ids.append(input_encoded['input_ids'])
        input_masks.append(input_encoded['attention_mask'])
        labels.append(1 if partisan == 'true' else 0)

    # Prepare and Create TF Dataset.
    all_input_ids = tf.constant(input_ids)
    all_input_masks = tf.constant(input_masks)
    all_labels = tf.constant(labels)
    dataset = tf.data.Dataset.from_tensor_slices(
        ({'input_ids': all_input_ids, 'attention_mask': all_input_masks},
         all_labels))
    if shuffle:
        dataset = dataset.shuffle(1024, reshuffle_each_iteration=True)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    return dataset
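
A hedged usage sketch; the toy DataFrame and checkpoint name are assumptions, but the function does expect 'text' and 'partisan' columns:

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
df = pd.DataFrame({'text': ['first article body', 'second article body'],
                   'partisan': ['true', 'false']})
train_ds = create_dataset(df, max_len=128, tokenizer=tokenizer, batch_size=2)
for batch, labels in train_ds.take(1):
    print(batch['input_ids'].shape, labels.numpy())  # (2, 128) [1 0]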
Example 3
from pathlib import Path
from typing import List

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.models.component import Component
from transformers import AutoTokenizer, BertTokenizer
from transformers.data.processors.utils import InputFeatures


class TorchTransformersMLMPreprocessor(Component):
    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 return_tokens: bool = False,
                 **kwargs):
        self.max_seq_length = max_seq_length
        self.return_tokens = return_tokens
        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            # AutoTokenizer cannot be instantiated directly, so build a
            # BertTokenizer from the local vocabulary file instead.
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

    def __call__(self, texts_a: List[str]):
        input_features = []
        tokens = []
        mask_idxs = []
        for text_a in texts_a:
            encoded_dict = self.tokenizer.encode_plus(
                text=text_a,
                add_special_tokens=True,
                max_length=self.max_seq_length,
                truncation=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')
            curr_features = InputFeatures(
                input_ids=encoded_dict['input_ids'],
                attention_mask=encoded_dict['attention_mask'],
                token_type_ids=encoded_dict['token_type_ids'],
                label=None)
            input_features.append(curr_features)
            # Tokens of the current example; keep them in a local variable so
            # they do not clobber the accumulated `tokens` list.
            curr_tokens = self.tokenizer.convert_ids_to_tokens(
                encoded_dict['input_ids'][0])
            if self.return_tokens:
                tokens.append(curr_tokens)
            # Remember the position of the (last) [MASK] subtoken.
            mask_idx = 0
            for i, token in enumerate(curr_tokens):
                if token == '[MASK]':
                    mask_idx = i
            mask_idxs.append(mask_idx)
        if self.return_tokens:
            return input_features, tokens, mask_idxs
        else:
            return input_features, mask_idxs
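
A sketch of how the preprocessor might be called (the checkpoint name and the sentence are assumptions):

preprocessor = TorchTransformersMLMPreprocessor(vocab_file='bert-base-uncased')
features, mask_idxs = preprocessor(['The capital of France is [MASK].'])
print(mask_idxs[0])  # index of the [MASK] subtoken in the padded sequence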
Example 4
import pandas as pd
import torch
import transformers

# NLIFineTuningModel and CONFIG are defined elsewhere in the surrounding project.


def get_error_samples(trained_model: NLIFineTuningModel,
                      df: pd.DataFrame,
                      tokenizer: transformers.AutoTokenizer,
                      max_length=256):
    """Get samples where model predicts incorrectly

    Args:
        trained_model (NLIFineTuningModel): saved model to make predictions
        df (pd.DataFrame): Dataframe with input text and labels
        tokenizer (transformers.AutoTokenizer): Tokenizer object to encode text input
        max_length (int, optional): Maximum permissible length of text to be considered. Defaults to 256

    Returns:
        pd.DataFrame: misclassified samples with gold and predicted labels,
            or None if the model made no errors
    """
    error_samples = []
    for _, row in df.iterrows():
        sentence_1 = row[CONFIG['sentence1']]
        sentence_2 = row[CONFIG['sentence2']]
        gold_label = row[CONFIG['labels']]
        encoded_input = tokenizer.encode_plus(text=sentence_1,
                                              text_pair=sentence_2,
                                              add_special_tokens=True,
                                              padding='max_length',
                                              truncation=True,
                                              max_length=max_length,
                                              return_token_type_ids=True,
                                              return_attention_mask=True,
                                              return_tensors='pt')
        output = trained_model(encoded_input)
        predicted_label = torch.argmax(output.logits)
        if predicted_label.item() != gold_label:
            error_samples.append({
                'Sentence 1': sentence_1,
                'Sentence 2': sentence_2,
                'Ground Label': gold_label,
                'Predicted Label': predicted_label.item()
            })
    reverse_map = {0: 'entailment', 1: 'contradiction', 2: 'neutral'}
    if error_samples:
        error_df = pd.DataFrame(error_samples)
        error_df['Ground Label Text'] = error_df['Ground Label'].map(
            reverse_map)
        error_df['Predicted Label Text'] = error_df['Predicted Label'].map(
            reverse_map)
        return error_df
    else:
        print('LOL, No Errors!')
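
An invocation sketch; trained_model, dev_df, and the checkpoint are assumptions standing in for project-specific objects:

tokenizer = transformers.AutoTokenizer.from_pretrained('bert-base-uncased')
error_df = get_error_samples(trained_model, dev_df, tokenizer, max_length=256)
if error_df is not None:
    print(error_df[['Ground Label Text', 'Predicted Label Text']].head())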
Example 5
from pathlib import Path
from typing import Dict, List

import torch

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.models.component import Component
from transformers import AutoTokenizer, BertTokenizer


class TorchTransformersMultiplechoicePreprocessor(Component):
    """Tokenize text on subtokens, encode subtokens with their indices, create tokens and segment masks.

    Check details in :func:`bert_dp.preprocessing.convert_examples_to_features` function.

    Args:
        vocab_file: path to vocabulary
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        return_tokens: whether to return tuple of input features and tokens, or only input features

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        return_tokens: whether to return tuple of input features and tokens, or only input features
        tokenizer: instance of Bert FullTokenizer

    """
    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 return_tokens: bool = False,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.return_tokens = return_tokens
        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            # AutoTokenizer cannot be instantiated directly, so build a
            # BertTokenizer from the local vocabulary file instead.
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

    def tokenize_mc_examples(
            self, contexts: List[List[str]],
            choices: List[List[str]]) -> Dict[str, torch.Tensor]:

        num_choices = len(contexts[0])
        batch_size = len(contexts)

        # tokenize examples in groups of `num_choices`
        examples = []
        for context_list, choice_list in zip(contexts, choices):
            for context, choice in zip(context_list, choice_list):
                tokenized_input = self.tokenizer.encode_plus(
                    text=context,
                    text_pair=choice,
                    return_attention_mask=True,
                    add_special_tokens=True,
                    truncation=True)

                examples.append(tokenized_input)

        padded_examples = self.tokenizer.pad(
            examples,
            padding=True,
            max_length=self.max_seq_length,
            return_tensors='pt',
        )

        padded_examples = {
            k: v.view(batch_size, num_choices, -1)
            for k, v in padded_examples.items()
        }

        return padded_examples

    def __call__(self,
                 texts_a: List[List[str]],
                 texts_b: List[List[str]] = None) -> Dict[str, torch.Tensor]:
        """Tokenize and create masks.

        Contexts and choices are separated by the [SEP] token.

        Args:
            texts_a: list of lists of contexts, one inner list per batch element
            texts_b: list of lists of answer choices, aligned with texts_a

        Returns:
            dict of padded input tensors, each of shape
                (batch_size, num_choices, seq_len)
        """

        input_features = self.tokenize_mc_examples(texts_a, texts_b)
        return input_features
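
A hedged shape sketch (the checkpoint and toy inputs are assumptions): one batch element with two answer choices yields tensors of shape (1, 2, seq_len):

preprocessor = TorchTransformersMultiplechoicePreprocessor(vocab_file='bert-base-uncased')
contexts = [['The sky is', 'The sky is']]  # one question, repeated per choice
choices = [['blue.', 'green.']]
batch = preprocessor(contexts, choices)
print(batch['input_ids'].shape)  # torch.Size([1, 2, seq_len])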
Example 6
from pathlib import Path
from typing import List, Optional, Tuple, Union

import torch

from deeppavlov.core.commands.utils import expand_path
from deeppavlov.core.models.component import Component
from transformers import AutoTokenizer, BertTokenizer
from transformers.data.processors.utils import InputFeatures


class TorchSquadTransformersPreprocessor(Component):
    """Tokenize text on subtokens, encode subtokens with their indices, create tokens and segment masks.

    Check details in :func:`bert_dp.preprocessing.convert_examples_to_features` function.

    Args:
        vocab_file: path to vocabulary
        do_lower_case: set True if lowercasing is needed
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        return_tokens: whether to return tuple of input features and tokens, or only input features

    Attributes:
        max_seq_length: max sequence length in subtokens, including [SEP] and [CLS] tokens
        return_tokens: whether to return tuple of input features and tokens, or only input features
        tokenizer: instance of Bert FullTokenizer

    """
    def __init__(self,
                 vocab_file: str,
                 do_lower_case: bool = True,
                 max_seq_length: int = 512,
                 return_tokens: bool = False,
                 add_token_type_ids: bool = False,
                 **kwargs) -> None:
        self.max_seq_length = max_seq_length
        self.return_tokens = return_tokens
        self.add_token_type_ids = add_token_type_ids
        if Path(vocab_file).is_file():
            vocab_file = str(expand_path(vocab_file))
            # AutoTokenizer cannot be instantiated directly, so build a
            # BertTokenizer from the local vocabulary file instead.
            self.tokenizer = BertTokenizer(vocab_file=vocab_file,
                                           do_lower_case=do_lower_case)
        else:
            self.tokenizer = AutoTokenizer.from_pretrained(
                vocab_file, do_lower_case=do_lower_case)

    def __call__(
        self,
        texts_a: List[str],
        texts_b: Optional[List[str]] = None
    ) -> Union[List[InputFeatures], Tuple[List[InputFeatures],
                                          List[List[str]]]]:
        """Tokenize and create masks.

        texts_a and texts_b are separated by [SEP] token

        Args:
            texts_a: list of texts,
            texts_b: list of texts, it could be None, e.g. single sentence classification task

        Returns:
            batch of :class:`transformers.data.processors.utils.InputFeatures` with subtokens, subtoken ids, \
                subtoken mask, segment mask, or tuple of batch of InputFeatures and Batch of subtokens
        """

        if texts_b is None:
            texts_b = [None] * len(texts_a)

        input_features = []
        tokens = []
        for text_a, text_b in zip(texts_a, texts_b):
            encoded_dict = self.tokenizer.encode_plus(
                text=text_a,
                text_pair=text_b,
                add_special_tokens=True,
                max_length=self.max_seq_length,
                truncation=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt')

            if 'token_type_ids' not in encoded_dict:
                if self.add_token_type_ids:
                    # Reconstruct segment ids: 0 up to and including the first
                    # [SEP], 1 for the rest of the padded sequence.
                    input_ids = encoded_dict['input_ids']
                    seq_len = input_ids.size(1)
                    sep = torch.where(
                        input_ids == self.tokenizer.sep_token_id)[1][0].item()
                    len_a = min(sep + 1, seq_len)
                    len_b = seq_len - len_a
                    encoded_dict['token_type_ids'] = torch.cat(
                        (torch.zeros(1, len_a, dtype=torch.long),
                         torch.ones(1, len_b, dtype=torch.long)),
                        dim=1)
                else:
                    encoded_dict['token_type_ids'] = torch.tensor([0])

            curr_features = InputFeatures(
                input_ids=encoded_dict['input_ids'],
                attention_mask=encoded_dict['attention_mask'],
                token_type_ids=encoded_dict['token_type_ids'],
                label=None)
            input_features.append(curr_features)
            if self.return_tokens:
                tokens.append(
                    self.tokenizer.convert_ids_to_tokens(
                        encoded_dict['input_ids'][0]))

        if self.return_tokens:
            return input_features, tokens
        else:
            return input_features
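
The add_token_type_ids branch matters for tokenizers that emit no segment ids of their own, e.g. DistilBERT; a hedged sketch (checkpoint and texts are assumptions):

preprocessor = TorchSquadTransformersPreprocessor(
    vocab_file='distilbert-base-uncased', add_token_type_ids=True)
features = preprocessor(['What color is the sky?'], ['The sky is blue.'])
print(features[0].token_type_ids.shape)  # torch.Size([1, 512])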