def pytorch_benchmark(batch_sizes, sequence_lengths, nums_random_blocks, output_path, attention_type="block_sparse"):
    """Benchmark BigBird forward/backward passes over a grid and save the means as CSV.

    For every combination of batch size, sequence length and random-block
    count, a ``google/bigbird-roberta-base`` sequence classifier is loaded,
    moved to the CUDA device and timed with ``time_foward_backward`` over a
    fixed number of trials. The mean forward and backward times are written
    as one CSV row per combination.

    Args:
        batch_sizes: Iterable of batch sizes; the module-level ``input_text``
            is repeated this many times to build each batch.
        sequence_lengths: Iterable of ``max_length`` values used for
            truncation when tokenizing.
        nums_random_blocks: Iterable of random-block counts; each value is
            printed and written to the ``r`` column.
        output_path: Path of the CSV file to create (overwritten if present).
        attention_type: Forwarded to ``BigBirdConfig.from_pretrained``.

    Any exception raised while timing one combination is printed together
    with its traceback and that combination is skipped; the benchmark then
    continues with the next one.
    """
    num_trials = 10
    checkpoint = "google/bigbird-roberta-base"
    device = torch.device("cuda")

    # The context manager guarantees the file is closed even if loading the
    # tokenizer or a model raises; newline="" is what the csv module expects
    # so that no blank rows appear on platforms that translate line endings.
    with open(output_path, "w", newline="") as fp:
        writer = csv.writer(fp)
        # NOTE(review): the "bakward" spelling is kept as-is because existing
        # consumers of the CSV may rely on the exact header text.
        writer.writerow(["batch_size", "seq_length", "r", "forward time (ms)", "bakward time (ms)"])

        tokenizer = BigBirdTokenizer.from_pretrained(checkpoint)

        for b, n, r in product(batch_sizes, sequence_lengths, nums_random_blocks):
            print(b, n, r)
            inputs = tokenizer([input_text for _ in range(b)], max_length=n, truncation=True, return_tensors="pt")
            # NOTE(review): `r` is only printed and recorded in the CSV; it is
            # not passed to the config or model here - confirm whether it
            # should be applied as a config override.
            config = BigBirdConfig.from_pretrained(checkpoint, attention_type=attention_type)
            model = BigBirdForSequenceClassification.from_pretrained(checkpoint, config=config)
            model.to(device)

            try:
                torch.cuda.synchronize()
                forward_time = 0
                backward_time = 0
                for _ in range(num_trials):
                    forward_elapse, backward_elapse = time_foward_backward(model, inputs)
                    forward_time += forward_elapse
                    backward_time += backward_elapse
                forward_time /= num_trials
                backward_time /= num_trials
                print(forward_time, backward_time)
                writer.writerow([b, n, r, forward_time, backward_time])
            except Exception as e:
                # Best effort: report the failing configuration and move on.
                print("Error:", e)
                traceback.print_exc()
def test_special_tokens(self):
    """
    Check that special tokens survive an encode/decode round trip.

    To reproduce the reference output with the original SentencePiece model:
        $ wget https://github.com/google-research/bigbird/blob/master/bigbird/vocab/gpt2.model?raw=true
        $ mv gpt2.model?raw=true gpt2.model

    ```
    import tensorflow_text as tft
    import tensorflow as tf

    vocab_model_file = "./gpt2.model"
    tokenizer = tft.SentencepieceTokenizer(model=tf.io.gfile.GFile(vocab_model_file, "rb").read())
    ids = tokenizer.tokenize("Paris is the [MASK].")
    ids = tf.concat([tf.constant([65]), ids, tf.constant([66])], axis=0)
    detokenized = tokenizer.detokenize(ids)  # should give [CLS] Paris is the [MASK].[SEP]
    ```

    The assertion below pins the string produced by ``BigBirdTokenizer.decode``,
    which has no space in front of ``[MASK]``.
    """
    tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")
    input_ids = tokenizer("Paris is the [MASK].").input_ids
    decoded_text = tokenizer.decode(input_ids)

    # assertEqual reports both strings on failure, unlike assertTrue(a == b).
    self.assertEqual(decoded_text, "[CLS] Paris is the[MASK].[SEP]")
def test_full_tokenizer(self):
    """Tokenize two sentences with the sample vocab and check pieces, ids and the id round trip."""
    tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True)

    # Short sentence: pieces and their vocabulary ids.
    simple_pieces = tokenizer.tokenize("This is a test")
    self.assertListEqual(simple_pieces, ["▁This", "▁is", "▁a", "▁t", "est"])
    simple_ids = tokenizer.convert_tokens_to_ids(simple_pieces)
    self.assertListEqual(simple_ids, [285, 46, 10, 170, 382])

    # Longer sentence containing a digit run and an accented character.
    expected_pieces = [
        SPIECE_UNDERLINE + "I",
        SPIECE_UNDERLINE + "was",
        SPIECE_UNDERLINE + "b",
        "or",
        "n",
        SPIECE_UNDERLINE + "in",
        SPIECE_UNDERLINE + "",
        "9",
        "2",
        "0",
        "0",
        "0",
        ",",
        SPIECE_UNDERLINE + "and",
        SPIECE_UNDERLINE + "this",
        SPIECE_UNDERLINE + "is",
        SPIECE_UNDERLINE + "f",
        "al",
        "s",
        "é",
        ".",
    ]
    expected_ids = [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4]
    # "9" and "é" are the two pieces expected to map to id 0, so they are
    # expected to come back as "<unk>" when the ids are converted to tokens.
    expected_round_trip = ["<unk>" if piece in ("9", "é") else piece for piece in expected_pieces]

    pieces = tokenizer.tokenize("I was born in 92000, and this is falsé.")
    self.assertListEqual(pieces, expected_pieces)

    piece_ids = tokenizer.convert_tokens_to_ids(pieces)
    self.assertListEqual(piece_ids, expected_ids)

    round_trip = tokenizer.convert_ids_to_tokens(piece_ids)
    self.assertListEqual(round_trip, expected_round_trip)
def big_tokenizer(self):
    """Load and return the pretrained ``google/bigbird-roberta-base`` tokenizer."""
    checkpoint = "google/bigbird-roberta-base"
    return BigBirdTokenizer.from_pretrained(checkpoint)
def setUp(self):
    """Run the parent setup, then save a sample-vocab tokenizer to ``self.tmpdirname``."""
    super().setUp()

    # Build the tokenizer from the sample vocabulary (accents kept) and
    # persist it straight into the test's temporary directory.
    BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True).save_pretrained(self.tmpdirname)
def load(cls, pretrained_model_name_or_path, revision=None, tokenizer_class=None, use_fast=True, **kwargs):
    """
    Enables loading of different Tokenizer classes with a uniform interface. Either infer the class from
    model config or define it manually via `tokenizer_class`.

    :param pretrained_model_name_or_path: The path of the saved pretrained model or its name (e.g. `bert-base-uncased`)
    :type pretrained_model_name_or_path: str
    :param revision: The version of model to use from the HuggingFace model hub. Can be tag name, branch name, or commit hash.
                     It is forwarded to `from_pretrained` as the `revision` keyword argument.
    :type revision: str
    :param tokenizer_class: (Optional) Name of the tokenizer class to load (e.g. `BertTokenizer`).
                            If omitted, it is inferred via `cls._infer_tokenizer_class`.
    :type tokenizer_class: str
    :param use_fast: (Optional, True by default) Indicate if the fast version of the tokenizer should be
                     loaded (True) or the Python one (False). `EmbeddingTokenizer` has no fast version:
                     the regular one is loaded and an error is logged when `use_fast` is True.
    :type use_fast: bool
    :param kwargs: Additional keyword arguments forwarded to the tokenizer's `from_pretrained`.
    :return: Tokenizer
    :raises Exception: If `tokenizer_class` matches none of the supported tokenizers, or the
                       selected tokenizer's `from_pretrained` returns None.
    """
    pretrained_model_name_or_path = str(pretrained_model_name_or_path)
    kwargs["revision"] = revision

    if tokenizer_class is None:
        tokenizer_class = cls._infer_tokenizer_class(pretrained_model_name_or_path)

    logger.info(f"Loading tokenizer of type '{tokenizer_class}'")

    # Pick the tokenizer class to instantiate. Matching is by substring, so the
    # order matters: "XLMRobertaTokenizer" must be tested before
    # "RobertaTokenizer" and "DistilBertTokenizer" before "BertTokenizer".
    # Class names are only evaluated inside the matching branch.
    loader = None
    fixed_kwargs = {}
    if "AlbertTokenizer" in tokenizer_class:
        loader = AlbertTokenizerFast if use_fast else AlbertTokenizer
        fixed_kwargs = {"keep_accents": True}
    elif "XLMRobertaTokenizer" in tokenizer_class:
        loader = XLMRobertaTokenizerFast if use_fast else XLMRobertaTokenizer
    elif "RobertaTokenizer" in tokenizer_class:
        loader = RobertaTokenizerFast if use_fast else RobertaTokenizer
    elif "DistilBertTokenizer" in tokenizer_class:
        loader = DistilBertTokenizerFast if use_fast else DistilBertTokenizer
    elif "BertTokenizer" in tokenizer_class:
        loader = BertTokenizerFast if use_fast else BertTokenizer
    elif "XLNetTokenizer" in tokenizer_class:
        loader = XLNetTokenizerFast if use_fast else XLNetTokenizer
        fixed_kwargs = {"keep_accents": True}
    elif "ElectraTokenizer" in tokenizer_class:
        loader = ElectraTokenizerFast if use_fast else ElectraTokenizer
    elif tokenizer_class == "EmbeddingTokenizer":
        if use_fast:
            logger.error(
                'EmbeddingTokenizerFast is not supported! Using EmbeddingTokenizer instead.'
            )
        loader = EmbeddingTokenizer
    elif "CamembertTokenizer" in tokenizer_class:
        loader = CamembertTokenizerFast if use_fast else CamembertTokenizer
    elif "DPRQuestionEncoderTokenizer" in tokenizer_class:
        loader = DPRQuestionEncoderTokenizerFast if use_fast else DPRQuestionEncoderTokenizer
    elif "DPRContextEncoderTokenizer" in tokenizer_class:
        loader = DPRContextEncoderTokenizerFast if use_fast else DPRContextEncoderTokenizer
    elif "BigBirdTokenizer" in tokenizer_class:
        loader = BigBirdTokenizerFast if use_fast else BigBirdTokenizer

    if loader is None:
        # Name the offending class so the failure can be diagnosed from the message.
        raise Exception(f"Unable to load tokenizer: unsupported tokenizer class '{tokenizer_class}'")

    # fixed_kwargs and kwargs are unpacked separately so that a caller passing
    # a conflicting key (e.g. keep_accents) still gets a TypeError, as before.
    ret = loader.from_pretrained(pretrained_model_name_or_path, **fixed_kwargs, **kwargs)
    if ret is None:
        raise Exception("Unable to load tokenizer")
    return ret
# Script comparing a BigBird model against a RoBERTa model; both
# tokenizer/model pairs are loaded at import time.
from transformers import BigBirdModel, BertModel
from transformers import BigBirdTokenizer, BertTokenizer
from transformers import RobertaTokenizer, RobertaModel
from tqdm import tqdm
import time
import numpy as np
import torch

# Checkpoint names. Note that `bert` holds the 'roberta-base' checkpoint and
# is loaded below with the Roberta classes, not the Bert ones.
bigbird = 'google/bigbird-roberta-base'
bert = 'roberta-base'

bbtokenizer = BigBirdTokenizer.from_pretrained(bigbird)
bbmodel = BigBirdModel.from_pretrained(bigbird)

bttokenizer = RobertaTokenizer.from_pretrained(bert)
btmodel = RobertaModel.from_pretrained(bert)

# Switch selecting which tokenizer/model pair the names `tokenizer` and
# `model` refer to.
use_bigbird = True
if use_bigbird:
    tokenizer = bbtokenizer
    model = bbmodel
else:
    tokenizer = bttokenizer
    model = btmodel

def get_latency(model, inputs):
    # Records a wall-clock start time and then calls the model 100 times on
    # the same keyword inputs, with a tqdm progress bar.
    # NOTE(review): `start` is never read and nothing is returned in the
    # visible code - the rest of this function is presumably outside this
    # view; confirm before relying on it.
    start = time.time()
    for _ in tqdm(range(100)):
        output = model(**inputs)
# NOTE(review): this call is the tail of a definition that begins before the
# visible part of the file; its enclosing scope and real indentation are not
# shown here. It writes one record with the token ids, the start/end token
# values and the category looked up in CATEGORY_MAPPING.
writer.write({
    "input_ids": ids,
    "start_token": start,
    "end_token": end,
    "category": CATEGORY_MAPPING[cat],
})

if __name__ == "__main__":
    """Running area"""
    from datasets import load_dataset
    from transformers import BigBirdTokenizer

    data = load_dataset("natural_questions")
    tokenizer = BigBirdTokenizer.from_pretrained("google/bigbird-roberta-base")

    # PROCESS_TRAIN is compared against the string "true" (not a bool) to
    # choose between the "train" and "validation" splits.
    data = data["train" if PROCESS_TRAIN == "true" else "validation"]

    # Keyword arguments handed to `prepare_inputs` for every example.
    fn_kwargs = dict(
        tokenizer=tokenizer,
        doc_stride=DOC_STRIDE,
        max_length=MAX_LENGTH,
        assertion=False,
    )
    data = data.map(prepare_inputs, fn_kwargs=fn_kwargs)
    # Drop the listed columns after mapping.
    data = data.remove_columns(["annotations", "document", "id", "question"])
    print(data)

    np.random.seed(SEED)
    # Output file name follows the same PROCESS_TRAIN switch as the split.
    # NOTE(review): nothing visible uses this name yet - the script
    # presumably continues past this view.
    cache_file_name = "nq-training.jsonl" if PROCESS_TRAIN == "true" else "nq-validation.jsonl"