Example no. 1
import csv
import traceback
from itertools import product

import torch
from transformers import BigBirdConfig, BigBirdForSequenceClassification, BigBirdTokenizer

# NOTE: `input_text` (the probe text replicated per batch) and the timing
# helper `time_forward_backward` are assumed to be defined elsewhere in this script.


def pytorch_benchmark(batch_sizes, sequence_lengths, nums_random_blocks, output_path, attention_type="block_sparse"):
    # Benchmark every (batch_size, seq_length, num_random_blocks) combination
    # and write one CSV row of timings per configuration.
    device = torch.device("cuda")

    fp = open(output_path, "w")
    writer = csv.writer(fp)
    writer.writerow(["batch_size", "seq_length", "r", "forward time (ms)", "bakward time (ms)"])
    tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
    for b, n, r in product(batch_sizes, sequence_lengths, nums_random_blocks):
        print(b, n, r)
        inputs = tokenizer([input_text for _ in range(b)], max_length=n, truncation=True, return_tensors="pt")
        config = BigBirdConfig.from_pretrained("google/bigbird-roberta-base", attention_type=attention_type)
        model = BigBirdForSequenceClassification.from_pretrained("google/bigbird-roberta-base", config=config)
        model.to(device)
        try:
            torch.cuda.synchronize()
            forward_time = 0
            backward_time = 0
            for _ in range(10):
                forward_elapsed, backward_elapsed = time_forward_backward(model, inputs)

                forward_time += forward_elapsed
                backward_time += backward_elapsed
            forward_time /= 10
            backward_time /= 10
            print(forward_time, backward_time)
            writer.writerow([b, n, r, forward_time, backward_time])
        except Exception as e:
            print("Error:", e)
            traceback.print_exc()

    fp.close()
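
# The timing helper `time_forward_backward` is not shown in this example.
# A minimal sketch of what it could look like: the name and call signature
# come from the call site above; the CUDA-event timing, the `.logits` access,
# and the device transfer are assumptions.
def time_forward_backward(model, inputs):
    """Time one forward and one backward pass on the GPU, in milliseconds."""
    inputs = {k: v.to("cuda") for k, v in inputs.items()}
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)

    # Forward pass.
    start.record()
    output = model(**inputs)
    end.record()
    torch.cuda.synchronize()
    forward_ms = start.elapsed_time(end)

    # Backward pass on a scalar reduction of the logits.
    loss = output.logits.float().sum()
    start.record()
    loss.backward()
    end.record()
    torch.cuda.synchronize()
    backward_ms = start.elapsed_time(end)

    model.zero_grad()
    return forward_ms, backward_ms


# Example sweep; writes one row per configuration to bigbird_benchmark.csv:
# pytorch_benchmark([1, 2], [1024, 4096], [3], "bigbird_benchmark.csv")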
    def test_special_tokens(self):
        """
        To reproduce:

        $ wget https://github.com/google-research/bigbird/blob/master/bigbird/vocab/gpt2.model?raw=true
        $ mv gpt2.model?raw=true gpt2.model

        ```
        import tensorflow_text as tft
        import tensorflow as tf

        vocab_model_file = "./gpt2.model"
        tokenizer = tft.SentencepieceTokenizer(model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
        ids = tokenizer.tokenize("Paris is the [MASK].")
        ids = tf.concat([tf.constant([65]), ids, tf.constant([66])], axis=0)
        detokenized = tokenizer.detokenize(ids)  # should give [CLS] Paris is the [MASK].[SEP]
        ```
        """
        tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
        decoded_text = tokenizer.decode(tokenizer("Paris is the [MASK].").input_ids)

        self.assertEqual(decoded_text, "[CLS] Paris is the[MASK].[SEP]")
    def test_full_tokenizer(self):
        tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True)

        tokens = tokenizer.tokenize("This is a test")
        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])

        self.assertListEqual(
            tokenizer.convert_tokens_to_ids(tokens),
            [285, 46, 10, 170, 382],
        )

        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
        self.assertListEqual(
            tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "9",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "é",
                ".",
            ],
        )
        ids = tokenizer.convert_tokens_to_ids(tokens)
        self.assertListEqual(
            ids,
            [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4],
        )

        back_tokens = tokenizer.convert_ids_to_tokens(ids)
        self.assertListEqual(
            back_tokens,
            [
                SPIECE_UNDERLINE + "I",
                SPIECE_UNDERLINE + "was",
                SPIECE_UNDERLINE + "b",
                "or",
                "n",
                SPIECE_UNDERLINE + "in",
                SPIECE_UNDERLINE + "",
                "<unk>",
                "2",
                "0",
                "0",
                "0",
                ",",
                SPIECE_UNDERLINE + "and",
                SPIECE_UNDERLINE + "this",
                SPIECE_UNDERLINE + "is",
                SPIECE_UNDERLINE + "f",
                "al",
                "s",
                "<unk>",
                ".",
            ],
        )
    def big_tokenizer(self):
        return BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
    def setUp(self):
        super().setUp()

        tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.save_pretrained(self.tmpdirname)
Example no. 6
    @classmethod
    def load(cls,
             pretrained_model_name_or_path,
             revision=None,
             tokenizer_class=None,
             use_fast=True,
             **kwargs):
        """
        Enables loading of different Tokenizer classes with a uniform interface. The class is either inferred from the
        model config or set manually via `tokenizer_class`.

        :param pretrained_model_name_or_path:  The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
        :type pretrained_model_name_or_path: str
        :param revision: The version of the model to use from the HuggingFace model hub. Can be a tag name, branch name, or commit hash.
        :type revision: str
        :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`)
        :type tokenizer_class: str
        :param use_fast: (Optional, True by default) Indicate if FARM should try to load the fast version of the tokenizer (True) or
            the Python one (False).
            Only DistilBERT, BERT and Electra fast tokenizers are supported.
        :type use_fast: bool
        :param kwargs:
        :return: Tokenizer
        """
        pretrained_model_name_or_path = str(pretrained_model_name_or_path)
        kwargs["revision"] = revision

        if tokenizer_class is None:
            tokenizer_class = cls._infer_tokenizer_class(
                pretrained_model_name_or_path)

        logger.info(f"Loading tokenizer of type '{tokenizer_class}'")
        # return appropriate tokenizer object
        ret = None
        if "AlbertTokenizer" in tokenizer_class:
            if use_fast:
                ret = AlbertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = AlbertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "XLMRobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLMRobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = XLMRobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "RobertaTokenizer" in tokenizer_class:
            if use_fast:
                ret = RobertaTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = RobertaTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DistilBertTokenizer" in tokenizer_class:
            if use_fast:
                ret = DistilBertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DistilBertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BertTokenizer" in tokenizer_class:
            if use_fast:
                ret = BertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "XLNetTokenizer" in tokenizer_class:
            if use_fast:
                ret = XLNetTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
            else:
                ret = XLNetTokenizer.from_pretrained(
                    pretrained_model_name_or_path, keep_accents=True, **kwargs)
        elif "ElectraTokenizer" in tokenizer_class:
            if use_fast:
                ret = ElectraTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = ElectraTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif tokenizer_class == "EmbeddingTokenizer":
            if use_fast:
                logger.error(
                    'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
                )
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = EmbeddingTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "CamembertTokenizer" in tokenizer_class:
            if use_fast:
                ret = CamembertTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = CamembertTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRQuestionEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRQuestionEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "DPRContextEncoderTokenizer" in tokenizer_class:
            if use_fast:
                ret = DPRContextEncoderTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = DPRContextEncoderTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        elif "BigBirdTokenizer" in tokenizer_class:
            if use_fast:
                ret = BigBirdTokenizerFast.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
            else:
                ret = BigBirdTokenizer.from_pretrained(
                    pretrained_model_name_or_path, **kwargs)
        if ret is None:
            raise Exception(f"Unable to load tokenizer for '{pretrained_model_name_or_path}'")
        else:
            return ret
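
# Usage sketch (assumption: `load` is a classmethod on FARM's `Tokenizer`
# class, which is not shown above). The tokenizer class can be inferred from
# the model config or pinned explicitly:
#
#     tokenizer = Tokenizer.load("google/bigbird-roberta-base")
#     tokenizer = Tokenizer.load(
#         "google/bigbird-roberta-base",
#         tokenizer_class="BigBirdTokenizer",
#         use_fast=False,
#     )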
Example no. 7
from transformers import BigBirdModel, BigBirdTokenizer
from transformers import RobertaModel, RobertaTokenizer
from tqdm import tqdm
import time
import numpy as np
import torch

bigbird = 'google/bigbird-roberta-base'
roberta = 'roberta-base'

bbtokenizer = BigBirdTokenizer.from_pretrained(bigbird)
bbmodel = BigBirdModel.from_pretrained(bigbird)

rbtokenizer = RobertaTokenizer.from_pretrained(roberta)
rbmodel = RobertaModel.from_pretrained(roberta)

use_bigbird = True

if use_bigbird:
    tokenizer = bbtokenizer
    model = bbmodel
else:
    tokenizer = rbtokenizer
    model = rbmodel


def get_latency(model, inputs):
    # Average wall-clock latency of a forward pass over 100 runs, in seconds.
    start = time.time()
    for _ in tqdm(range(100)):
        output = model(**inputs)
    return (time.time() - start) / 100
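
# `get_latency` is never called in the original fragment; a minimal usage
# sketch (the probe sentence is illustrative):
latency_inputs = tokenizer("BigBird handles long documents with sparse attention.", return_tensors="pt")
print(f"avg latency per forward pass: {get_latency(model, latency_inputs) * 1000:.1f} ms")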
# --- fragment from the natural_questions preprocessing script (its __main__
# block follows below); the enclosing loop that defines `writer`, `ids`,
# `start`, `end` and `cat` is not shown, so the call is kept commented out ---
#
#                 writer.write({
#                     "input_ids": ids,
#                     "start_token": start,
#                     "end_token": end,
#                     "category": CATEGORY_MAPPING[cat],
#                 })


if __name__ == "__main__":
    """Running area"""
    from datasets import load_dataset

    from transformers import BigBirdTokenizer

    # PROCESS_TRAIN, DOC_STRIDE, MAX_LENGTH, SEED, CATEGORY_MAPPING and
    # prepare_inputs are assumed to be defined earlier in this script (not shown).

    data = load_dataset("natural_questions")
    tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")

    data = data["train" if PROCESS_TRAIN == "true" else "validation"]

    fn_kwargs = dict(
        tokenizer=tokenizer,
        doc_stride=DOC_STRIDE,
        max_length=MAX_LENGTH,
        assertion=False,
    )
    data = data.map(prepare_inputs, fn_kwargs=fn_kwargs)
    data = data.remove_columns(["annotations", "document", "id", "question"])
    print(data)

    np.random.seed(SEED)
    cache_file_name = "nq-training.jsonl" if PROCESS_TRAIN == "true" else "nq-validation.jsonl"