def test_shape_on_random_data(self):
    set_seed(42)

    bs = 3
    src_len = 5
    tgt_len = 7

    encoder_config = transformers.BertConfig(
        hidden_size=11,
        intermediate_size=44,
        vocab_size=17,
        num_hidden_layers=1,
        num_attention_heads=1,
    )
    encoder = transformers.BertModel(encoder_config)

    # decoder accepts vocabulary of schema vocab + pointer embeddings
    decoder_config = transformers.BertConfig(
        hidden_size=11,
        intermediate_size=44,
        vocab_size=23,
        is_decoder=True,
        num_hidden_layers=1,
        num_attention_heads=1,
    )
    decoder = transformers.BertModel(decoder_config)

    # logits are projected into schema vocab and combined with pointer scores
    max_pointer = src_len + 3
    model = EncoderDecoderWPointerModel(encoder=encoder, decoder=decoder, max_src_len=max_pointer)

    x_enc = torch.randint(0, encoder_config.vocab_size, size=(bs, src_len))
    x_dec = torch.randint(0, decoder_config.vocab_size, size=(bs, tgt_len))

    out = model(input_ids=x_enc, decoder_input_ids=x_dec)

    # different encoders return a different number of outputs,
    # e.g. BERT returns two, but DistilBERT only one
    self.assertGreaterEqual(len(out), 4)
    schema_vocab = decoder_config.vocab_size - max_pointer

    combined_logits = out[0]
    expected_shape = (bs, tgt_len, schema_vocab + src_len)
    self.assertEqual(combined_logits.shape, expected_shape)

    decoder_hidden = out[1]
    expected_shape = (bs, tgt_len, decoder_config.hidden_size)
    self.assertEqual(decoder_hidden.shape, expected_shape)

    pooler_output = out[2]
    expected_shape = (bs, decoder_config.hidden_size)
    self.assertEqual(pooler_output.shape, expected_shape)

    encoder_hidden = out[3]
    expected_shape = (bs, src_len, encoder_config.hidden_size)
    self.assertEqual(encoder_hidden.shape, expected_shape)
def __init__(self, code_token_counter, query_token_counter):
    self.code_token_counter = code_token_counter
    self.code_config = transformers.BertConfig(
        vocab_size=len(code_token_counter),
        pad_token_id=get_counter_map(code_token_counter)["[PAD]"])
    self.code_model = transformers.BertModel(self.code_config)

    self.query_token_counter = query_token_counter
    self.query_config = transformers.BertConfig(
        vocab_size=len(query_token_counter),
        pad_token_id=get_counter_map(query_token_counter)["[PAD]"])
    self.query_model = transformers.BertModel(self.query_config)
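# get_counter_map is not defined in this snippet; a minimal sketch of what it
# might look like, assuming it turns a token Counter (which already contains
# "[PAD]") into a token -> integer id mapping. The ordering is an assumption.
def get_counter_map(token_counter):
    # assign a stable id to every token tracked by the counter
    return {token: idx for idx, token in enumerate(sorted(token_counter))}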
def test_loss_computation(self):
    torch.manual_seed(42)
    src_vocab_size = 17
    tgt_vocab_size = 23

    encoder_config = transformers.BertConfig(
        hidden_size=11,
        intermediate_size=44,
        vocab_size=src_vocab_size,
        num_hidden_layers=1,
        num_attention_heads=1,
    )
    encoder = transformers.BertModel(encoder_config)

    max_position = 7
    decoder_config = transformers.BertConfig(
        hidden_size=11,
        intermediate_size=44,
        vocab_size=tgt_vocab_size + max_position,
        is_decoder=True,
        num_hidden_layers=1,
        num_attention_heads=1,
    )
    decoder = transformers.BertModel(decoder_config)

    model = EncoderDecoderWPointerModel(encoder=encoder, decoder=decoder, max_src_len=7)

    # similar to real data
    src_seq = torch.LongTensor([[1, 6, 12, 15, 2, 0, 0], [1, 6, 12, 15, 5, 3, 2]])
    tgt_seq = torch.LongTensor([
        [8, 6, 4, 10, 11, 8, 5, 1, 12, 7, 7, 0, 0],
        [8, 6, 4, 10, 11, 8, 5, 1, 12, 13, 14, 7, 7],
    ])
    mask = torch.FloatTensor([[0, 1, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 1, 0]])

    loss = model(
        input_ids=src_seq,
        decoder_input_ids=tgt_seq,
        pointer_mask=mask,
        labels=tgt_seq,
    )[0]

    self.assertEqual(loss.shape, torch.Size([]))
    self.assertEqual(loss.dtype, torch.float32)
    self.assertGreater(loss, 0)
def generate_onnx_model(model_name: str, filename: str, seq_len: int,
                        batch_size: int, backend: str):
    import transformers
    import torch
    import os

    test_device = torch.device(
        'cuda:0') if backend == "GPU" else torch.device('cpu:0')
    torch.set_grad_enabled(False)

    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    model.eval()
    model.to(test_device)

    cfg = model.config  # type: transformers.BertConfig
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)

    with open(filename, 'wb') as outf:
        torch.onnx.export(model=model, args=(input_ids, ), f=outf)
        outf.flush()

    return cfg.vocab_size
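# A minimal usage sketch (not part of the original benchmark): load the file
# written by generate_onnx_model with onnxruntime and run one forward pass.
# The input name is queried from the session rather than hard-coded, because
# torch.onnx.export above does not set explicit input_names.
import numpy as np
import onnxruntime as ort

def run_exported_model(filename: str, batch_size: int, seq_len: int, vocab_size: int):
    session = ort.InferenceSession(filename)
    input_name = session.get_inputs()[0].name
    input_ids = np.random.randint(0, vocab_size - 1, size=(batch_size, seq_len), dtype=np.int64)
    # returns the ONNX graph outputs (for BERT: last hidden state and pooler output)
    return session.run(None, {input_name: input_ids})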
def __init__(self, vocab_size, hidden_size, dropout, n_layers=1, vocab_file='./data/vocab.txt'):
    super(UntrainedEncoderBERT, self).__init__()
    self.vocab_size = vocab_size
    self.hidden_size = hidden_size
    self.dropout = dropout
    self.dropout_layer = nn.Dropout(dropout)
    self.embedding = nn.Embedding(vocab_size, hidden_size, padding_idx=PAD_token)
    self.embedding.weight.data.normal_(0, 0.1)
    self.config = transformers.BertConfig(vocab_size=self.vocab_size,
                                          hidden_size=self.hidden_size,
                                          num_hidden_layers=n_layers,
                                          hidden_dropout_prob=dropout,
                                          attention_probs_dropout_prob=dropout,
                                          num_attention_heads=16,
                                          output_hidden_states=True,
                                          max_position_embeddings=1024)
    self.tokenizer = transformers.BertTokenizer(vocab_file,
                                                pad_token='PAD',
                                                unk_token='UNK',
                                                sep_token='EOS')
    self.BERT = transformers.BertModel(self.config)
    self.training = True
def dataset():
    """
    Check if the data in two instances is different
    """
    args = "--config demo_tiny_128".split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)
    loader = TFRecordPretrainingDataset(config.input_files)
    loader = get_dataloader(config, opts)

    # Save part of the data as a list
    loader_list = list(loader)[0][0][0].numpy()

    # MPI to broadcast data in root=1 to root=0
    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    loader_list_copy = np.copy(loader_list)
    comm.Bcast(loader_list, root=1)

    # Assert if data broadcast to root=0 is different
    if comm.Get_rank() == 0 and not np.all(loader_list_copy == loader_list):
        print('Passed test: instances have different data')

    # Wait until both roots are finished
    time.sleep(2)
def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
                    num_threads: int):
    import torch
    import transformers
    import contexttimer
    import benchmark_helper

    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)

    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    model.eval()
    cfg = model.config  # type: transformers.BertConfig

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)
    benchmark_helper.run_model(lambda: model(input_ids), False, n, batch_size,
                               seq_len, "torch", num_threads)
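# benchmark_helper.run_model is not shown here; a minimal sketch of the kind of
# timing loop it likely wraps. The argument names mirror the call above and are
# assumptions, not the original helper.
import json
import time

def run_model_sketch(model_fn, use_gpu, n, batch_size, seq_len, framework, num_threads):
    model_fn()  # warm-up call so lazy initialisation is not timed
    start = time.time()
    for _ in range(n):
        model_fn()
    elapsed = time.time() - start
    print(json.dumps({"QPS": n / elapsed, "elapsed": elapsed, "n": n,
                      "batch_size": batch_size, "seq_len": seq_len,
                      "framework": framework, "n_threads": num_threads}))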
def __init__(self, hp: Optional[ModelParams] = ModelParams(), max_seq_len=1024):
    super().__init__()
    self.hp = hp
    config = transformers.BertConfig(
        hidden_size=self.hp.dim,
        num_hidden_layers=0,
        num_attention_heads=1,
        intermediate_size=0,
        max_position_embeddings=max_seq_len,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    )
    self.e = transformers.BertModel(config, add_pooling_layer=False)
    for name, param in self.e.named_parameters():
        # param names:
        #   embeddings.word_embeddings.weight
        #   embeddings.position_embeddings.weight
        #   embeddings.token_type_embeddings.weight
        #   embeddings.LayerNorm.weight
        #   embeddings.LayerNorm.bias
        if 'position_embeddings' in name:
            requires_grad = self.hp.position_embedding_requires_grad
        else:
            requires_grad = self.hp.requires_grad
        param.requires_grad = requires_grad
def __init__(
    self,
    train_dataloader,
    val_dataloader,
    args,
    fold,
    baselines=None,
    gamma=0.99,
    beta=0.01,
    lr=1e-5,
    device="cpu",
):
    if args.decoder == "lstm":
        self.policy = PolicyNet(args=args)
    else:
        config = transformers.BertConfig()
        config.hidden_size = 2048
        config.num_attention_heads = 8
        config.num_hidden_layers = 4
        config.max_position_embeddings = 1500
        self.policy = Transformer(config, device)
    self.policy.to(device)
    self.optimizer = torch.optim.Adam(self.policy.parameters(), lr=lr)
    self.baselines = baselines
    self.train_dataloader = train_dataloader
    self.val_dataloader = val_dataloader
    self.device = device
    self.args = args
    self.fold = fold
    self.beta = beta
def get_torch_model(
    model_name: str,
    input_shape: Tuple[int, ...],
    output_shape: Tuple[int, int],  # pylint: disable=unused-argument
    dtype: str = "float32",
) -> Tuple[IRModule, Dict[str, NDArray]]:
    """Load model from torch model zoo

    Parameters
    ----------
    model_name : str
        The name of the model to load
    input_shape: Tuple[int, ...]
        Tuple for input shape
    output_shape: Tuple[int, int]
        Tuple for output shape
    dtype: str
        Tensor data type
    """
    assert dtype == "float32"

    import torch  # type: ignore # pylint: disable=import-error,import-outside-toplevel
    from torchvision import models  # type: ignore # pylint: disable=import-error,import-outside-toplevel
    import transformers  # type: ignore # pylint: disable=import-error,import-outside-toplevel
    import os  # type: ignore # pylint: disable=import-error,import-outside-toplevel

    def do_trace(model, inp):
        model.eval()
        model_trace = torch.jit.trace(model, inp)
        model_trace.eval()
        return model_trace

    # Load model from torchvision
    if MODEL_TYPES[model_name] == MODEL_TYPE.TEXT_CLASSIFICATION:
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
        model = transformers.BertModel(
            transformers.BertConfig(
                num_hidden_layers=12,
                hidden_size=768,
                intermediate_size=3072,
                num_attention_heads=12,
                return_dict=False,
            ))
        model.eval()
        input_data = torch.randint(10000, input_shape)
        shape_list = [("input_ids", input_shape)]
        scripted_model = torch.jit.trace(model, [input_data], strict=False)
    elif MODEL_TYPES[model_name] == MODEL_TYPE.IMAGE_CLASSIFICATION:
        model = getattr(models, model_name)()
        # Setup input
        input_data = torch.randn(input_shape).type(torch.float32)
        shape_list = [("input0", input_shape)]
        # Get trace. Depending on the model type, wrapper may be necessary.
        scripted_model = do_trace(model, input_data)
    else:
        raise ValueError("Unsupported model in Torch model zoo.")

    # Convert torch model to relay module
    mod, params = relay.frontend.from_pytorch(scripted_model, shape_list)
    return mod, params
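# A minimal usage sketch (not part of the original file): build the Relay
# module returned above with TVM and run it once. The target, runtime, and
# input dtype are assumptions for illustration.
import numpy as np
import tvm
from tvm import relay
from tvm.contrib import graph_executor

def run_relay_module(mod, params, input_name, input_shape):
    with tvm.transform.PassContext(opt_level=3):
        lib = relay.build(mod, target="llvm", params=params)
    dev = tvm.cpu()
    module = graph_executor.GraphModule(lib["default"](dev))
    module.set_input(input_name, np.random.randint(0, 10000, size=input_shape).astype("int64"))
    module.run()
    return module.get_output(0).numpy()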
def generate_onnx_model(model_name: str,
                        use_gpu: bool,
                        filename: str,
                        seq_len: int,
                        batch_size: int,
                        backend: str,
                        use_dynamic_axes: bool = False):
    import transformers
    import torch
    import os

    test_device = torch.device(
        'cuda:0') if backend == "GPU" and use_gpu else torch.device('cpu:0')
    torch.set_grad_enabled(False)

    if model_name == "bert":
        # use a real model to check the correctness
        if checkonnxrest:
            model = transformers.BertModel.from_pretrained("bert-base-uncased")
        else:
            cfg = transformers.BertConfig()
            model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    model.eval()
    model.to(test_device)

    cfg = model.config  # type: transformers.BertConfig
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)

    with open(filename, 'wb') as outf:
        if not use_dynamic_axes:
            torch.onnx.export(model=model, args=(input_ids, ), f=outf)
        else:
            torch.onnx.export(model=model,
                              args=(input_ids, ),
                              f=outf,
                              input_names=['input'],
                              output_names=['output'],
                              dynamic_axes={
                                  'input': [0, 1],
                                  'output': [0, 1]
                              })
        # If you do not need onnxruntime to support variable batch size and
        # sequence length, you can leave the `dynamic_axes` parameter unset.
        # For some models, you may have to try `opset_version=12`.
        outf.flush()

    return cfg.vocab_size, cfg
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int,
                                 enable_random: bool, max_seq_len: int,
                                 min_seq_len: int, num_threads: int,
                                 use_gpu: bool, enable_mem_opt: bool):
    import torch
    import transformers
    import turbo_transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    cfg = None
    torch.set_grad_enabled(False)

    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model, backend="turbo")
    elif model_name == "albert":
        cfg = transformers.AlbertConfig(hidden_size=768,
                                        num_attention_heads=12,
                                        intermediate_size=3072)
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    elif model_name == "distilbert":
        cfg = transformers.DistilBertConfig()
        model = transformers.DistilBertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.DistilBertModel.from_torch(model)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    turbo_transformers.set_num_threads(num_threads)

    if enable_random:
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("model-aware")
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg, enable_mem_opt, model_name)
        if enable_mem_opt:
            turbo_transformers.reset_allocator_schema("naive")
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads,
                                   enable_mem_opt, model_name)
def test_shape_on_real_data_batched(self):
    set_seed(42)
    src_vocab_size = 17
    tgt_vocab_size = 23
    max_position = 7

    encoder_config = transformers.BertConfig(
        hidden_size=11,
        intermediate_size=44,
        vocab_size=src_vocab_size,
        num_hidden_layers=1,
        num_attention_heads=1,
    )
    encoder = transformers.BertModel(encoder_config)

    decoder_config = transformers.BertConfig(
        hidden_size=11,
        intermediate_size=44,
        vocab_size=tgt_vocab_size + max_position,
        is_decoder=True,
        num_hidden_layers=1,
        num_attention_heads=1,
    )
    decoder = transformers.BertModel(decoder_config)

    model = EncoderDecoderWPointerModel(encoder=encoder, decoder=decoder, max_src_len=max_position)

    # similar to real data
    src_seq = torch.LongTensor([[1, 6, 12, 15, 2, 0, 0], [1, 6, 12, 15, 5, 3, 2]])
    tgt_seq = torch.LongTensor([
        [8, 6, 4, 10, 11, 8, 5, 1, 12, 7, 7, 0, 0],
        [8, 6, 4, 10, 11, 8, 5, 1, 12, 13, 14, 7, 7],
    ])
    mask = torch.FloatTensor([[0, 1, 1, 1, 0, 0, 0], [0, 1, 1, 1, 1, 1, 0]])

    combined_logits = model(input_ids=src_seq, decoder_input_ids=tgt_seq, pointer_mask=mask)[0]

    expected_shape = (2, tgt_seq.shape[1], tgt_vocab_size + src_seq.shape[1])
    self.assertEqual(combined_logits.shape, expected_shape)
def test_shape_on_real_data(self):
    set_seed(42)
    src_vocab_size = 17
    tgt_vocab_size = 23
    max_position = 5

    encoder_config = transformers.BertConfig(
        hidden_size=11,
        intermediate_size=44,
        vocab_size=src_vocab_size,
        num_hidden_layers=1,
        num_attention_heads=1,
    )
    encoder = transformers.BertModel(encoder_config)

    decoder_config = transformers.BertConfig(
        hidden_size=11,
        intermediate_size=44,
        vocab_size=tgt_vocab_size + max_position,
        is_decoder=True,
        num_hidden_layers=1,
        num_attention_heads=1,
    )
    decoder = transformers.BertModel(decoder_config)

    model = EncoderDecoderWPointerModel(encoder=encoder, decoder=decoder, max_src_len=max_position)

    # similar to real data
    # e.g. '[CLS] Directions to Lowell [SEP]'
    src_seq = torch.LongTensor([[1, 6, 12, 15, 2]])
    # e.g. '[IN:GET_DIRECTIONS Directions to [SL:DESTINATION Lowell]]'
    tgt_seq = torch.LongTensor([[8, 6, 4, 10, 11, 8, 5, 1, 12, 7, 7]])
    mask = torch.FloatTensor([[0, 1, 1, 1, 0]])

    combined_logits = model(input_ids=src_seq, decoder_input_ids=tgt_seq, pointer_mask=mask)[0]

    expected_shape = (1, tgt_seq.shape[1], tgt_vocab_size + src_seq.shape[1])
    self.assertEqual(combined_logits.shape, expected_shape)
def test_run_with_no_exception(self):
    # Arrange
    train_data_file = os.path.join(os.path.dirname(__file__), "sample_sst2.csv")
    tempdir = tempfile.mkdtemp()
    batch = 3

    # Bert Config
    vocab_size = 20000
    sequence_len = 20
    num_classes = 14
    bert_config = transformers.BertConfig(vocab_size=vocab_size,
                                          hidden_size=10,
                                          num_hidden_layers=1,
                                          num_attention_heads=1,
                                          num_labels=num_classes)

    # Mock tokenisor
    mock_tokenisor = MagicMock()
    mock_tokenisor.tokenize.side_effect = lambda x: x.split(" ")
    mock_tokenisor.convert_tokens_to_ids = lambda x: [i for i, _ in enumerate(x)]

    # Builder
    b = Builder(train_data=train_data_file,
                val_data=train_data_file,
                dataset_factory_name="datasets.sst2_dataset_factory.SST2DatasetFactory",
                checkpoint_dir=tempdir,
                epochs=2,
                grad_accumulation_steps=1,
                early_stopping_patience=2,
                batch_size=batch,
                max_seq_len=sequence_len,
                model_dir=tempdir)
    b.set_tokensior(mock_tokenisor)
    b.set_bert_config(bert_config)
    trainer = b.get_trainer()

    # Get data loaders
    train_dataloader, val_dataloader = b.get_train_val_dataloader()

    # Act
    # Run training
    trainer.run_train(train_iter=train_dataloader,
                      validation_iter=val_dataloader,
                      model_network=b.get_network(),
                      loss_function=b.get_loss_function(),
                      optimizer=b.get_optimiser(),
                      pos_label=b.get_pos_label_index())
def test_smart_batch(use_cuda: bool):
    test_device = torch.device('cuda:0') if use_cuda else torch.device('cpu:0')

    cfg = transformers.BertConfig(attention_probs_dropout_prob=0.0,
                                  hidden_dropout_prob=0.0)
    torch_model = transformers.BertModel(cfg)
    # model_id = "bert-base-uncased"
    # torch_model = transformers.BertModel.from_pretrained(model_id)
    torch_model.eval()
    torch_model.to(test_device)
    torch.set_grad_enabled(False)
    cfg = torch_model.config

    # use 4 threads for computing
    if not use_cuda:
        turbo_transformers.set_num_threads(4)

    # Initialize a turbo BertModel with smart batching from the torch model.
    turbo_model = turbo_transformers.BertModelSmartBatch.from_torch(torch_model)

    # a batch of queries with different lengths
    query_seq_len_list = [18, 2, 3, 51]
    input_list = []

    # generate random inputs; of course you can use real data
    for query_seq_len in query_seq_len_list:
        input_seq = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(1, query_seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        input_list.append(input_seq)

    # start inference
    s_res = serial_bert_inference(torch_model, input_list)
    b_res = batch_bert_inference(turbo_model, input_list, query_seq_len_list)
    print(torch.max(torch.abs(b_res - s_res)))
    assert (torch.max(torch.abs(b_res - s_res)) < 1e-2)

    start_time = time.time()
    for i in range(10):
        serial_bert_inference(torch_model, input_list)
    end_time = time.time()
    print("\ntorch time consum: {}".format(end_time - start_time))

    start_time = time.time()
    for i in range(10):
        batch_bert_inference(turbo_model, input_list, query_seq_len_list)
    end_time = time.time()
    print("\nturbo time consum: {}".format(end_time - start_time))
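# serial_bert_inference and batch_bert_inference are defined elsewhere; a
# minimal sketch of the serial baseline, assuming it runs each query through
# the torch model one at a time and concatenates the sequence outputs along
# the token dimension so the result is comparable with the smart-batched one:
def serial_bert_inference_sketch(torch_model, input_list):
    outputs = [torch_model(input_seq)[0] for input_seq in input_list]  # each (1, len_i, hidden)
    return torch.cat(outputs, dim=1)  # (1, sum(len_i), hidden)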
def init_model(num):
    configuration = transformers.BertConfig.from_json_file("./model/config.json")
    model_class = transformers.BertForSequenceClassification
    tokenizer_class = transformers.BertTokenizer

    if num == 1:
        pretrained_weights = './model/model1/'
        idx_to_token = {
            0: "[no connective]", 1: "and", 2: "by contrast", 3: "by then",
            4: "finally", 5: "for example", 6: "however", 7: "in other words",
            8: "in particular", 9: "indeed", 10: "instead", 11: "meanwhile",
            12: "moreover", 13: "nevertheless", 14: "on the other hand",
            15: "otherwise", 16: "overall", 17: "rather", 18: "then",
            19: "therefore"
        }
    else:
        pretrained_weights = './model/model2/'
        idx_to_token = {
            0: "[no connective]", 1: "although", 2: "and", 3: "because",
            4: "but", 5: "for example", 6: "however", 7: "or", 8: "so",
            9: "so that", 10: "unless", 11: "while"
        }

    pretrained_tokenizer = './tokenizer'

    # load weights for tokenizer and model
    tokenizer = tokenizer_class.from_pretrained(pretrained_tokenizer)
    model = model_class.from_pretrained(pretrained_weights)
    model.eval()
    return model, tokenizer, idx_to_token
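# A minimal usage sketch (not from the original code) showing how the returned
# triple might be used to predict a connective for a sentence pair. It assumes
# a transformers version where the classifier returns a ModelOutput with
# .logits; the encoding details (truncation, separator handling) are guesses.
import torch

def predict_connective(model, tokenizer, idx_to_token, sent1, sent2):
    inputs = tokenizer(sent1, sent2, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # map the highest-scoring class index back to a connective string
    return idx_to_token[int(logits.argmax(dim=-1))]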
def __post_init__(self):
    assert self.size in ["small", "large"]
    if self.size == "small":
        self.hidden_size = 256
        self.num_attention_heads = 4
        self.num_hidden_layers = 2
    self.model_config = transformers.BertConfig(
        hidden_size=self.hidden_size,
        num_attention_heads=self.num_attention_heads,
        num_hidden_layers=self.num_hidden_layers,
        max_position_embeddings=self.max_position_embeddings,
    )
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int,
                                 enable_random: bool, max_seq_len: int,
                                 min_seq_len: int, num_threads: int,
                                 use_gpu: bool):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    if use_gpu:
        print("using GPU")
    else:
        print("using CPU")

    cfg = None
    torch.set_grad_enabled(False)

    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    turbo_transformers.set_num_threads(num_threads)

    if enable_random:
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "turbo", num_threads,
                                            cfg)
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "turbo", num_threads)
def __init__(self, decode_fn):
    super(BleurtEmbDiscriminator, self).__init__()
    self.decode_fn = decode_fn

    from BleurtTorch import BleurtModel
    import transformers

    checkpoint = "bleurt/bleurt-base-128-torch.pb"
    config = transformers.BertConfig()
    self.bleurt_model = BleurtModel(config)
    self.bleurt_model.load_state_dict(torch.load(checkpoint))
    self.bleurt_model.eval()

    self.tokenizer = transformers.BertTokenizerFast.from_pretrained("bert-base-uncased")

    from BleurtTorch import encode_batch
    self.encode_batch = encode_batch
def test_wikipedia_dataset():
    args = "--config demo_tiny_128".split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    config.vocab_size = 30522
    config.input_files = ["data/wikipedia/128/wiki_000.tfrecord"]

    num_tokens = 0
    replacement_counts = Counter({"103": 0, "same": 0, "random": 0})

    dataset = get_dataset(config)
    opts = get_options(config)
    loader = DataLoader(opts,
                        dataset,
                        batch_size=config.batch_size,
                        num_workers=config.dataloader_workers)

    for datum in tqdm(loader):
        tokens, attn_mask, types, mask_lm_pos, labels, nsp = datum
        tokens = tokens.numpy()
        attn_mask = attn_mask.numpy()
        types = types.numpy()
        mask_lm_pos = mask_lm_pos.numpy()
        labels = labels.numpy()
        nsp = nsp.numpy()
        for b in range(config.batch_size):
            check_dimensions(config, tokens[b], attn_mask[b], types[b],
                             mask_lm_pos[b], labels[b], nsp[b])
            check_tokens(config, tokens[b], mask_lm_pos[b], labels[b])
            check_attention_mask(attn_mask[b], tokens[b])
            check_mask_lm_positions(config, mask_lm_pos[b])
            check_labels(config, tokens[b], mask_lm_pos[b], labels[b])
            check_token_type(types[b])
            check_nsp(nsp[b])
            replacement_counts += mask_type_count(tokens[b], mask_lm_pos[b], labels[b])
            # Number of tokens, not including padding
            num_tokens += attn_mask[b, attn_mask[b] == 1].shape[0]

    # Test masked token proportions
    total = sum(replacement_counts.values())
    for k in replacement_counts:
        replacement_counts[k] /= total
    assert (0.79 < replacement_counts["103"] < 0.81)
    assert (0.09 < replacement_counts["same"] < 0.11)
    assert (0.09 < replacement_counts["random"] < 0.11)
    assert (0.14 < total / num_tokens < 0.16)  # should be ~0.15
def benchmark_torch_jit(model_name: str, seq_len: int, batch_size: int, n: int,
                        enable_random: bool, max_seq_len: int,
                        min_seq_len: int, num_threads: int, use_gpu: bool,
                        enable_mem_opt: bool):
    import transformers
    import contexttimer
    import torch.jit

    torch.set_num_threads(num_threads)
    torch.set_grad_enabled(False)

    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    model.eval()

    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long)

    model = torch.jit.trace(model, (input_ids, ))

    with torch.jit.optimized_execution(True):
        model(input_ids)  # warm up
        with contexttimer.Timer() as t:
            for _ in range(n):
                model(input_ids)

    print(
        json.dumps({
            "QPS": n / t.elapsed,
            "elapsed": t.elapsed,
            "n": n,
            "batch_size": batch_size,
            "seq_len": seq_len,
            "framework": "torch_jit",
            "n_threads": num_threads,
            "model_name": model_name
        }))
def __init__(self,
             vocab: nnlp.Vocab,
             n_embd: int = 256,
             n_layer: int = 4,
             n_head: int = 4,
             n_position: int = 128,
             n_ctx: int = 128):
    super(BERTLMModel, self).__init__()
    config = transformers.BertConfig(vocab_size=len(vocab),
                                     hidden_size=n_embd,
                                     num_hidden_layers=n_layer,
                                     num_attention_heads=n_head,
                                     output_hidden_states=True)
    self.bert_model = transformers.BertForMaskedLM(config)
    self.vocab = vocab
    self.mlm_probability = 0.15
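# mlm_probability = 0.15 is the standard BERT masking rate. The training loop
# is not shown; a minimal sketch (an assumption, not the original code) of the
# usual 80/10/10 masking scheme used to build masked-LM inputs and labels:
import torch

def mask_tokens_sketch(input_ids, mask_token_id, vocab_size, mlm_probability=0.15):
    # note: modifies input_ids in place
    labels = input_ids.clone()
    masked = torch.bernoulli(torch.full(labels.shape, mlm_probability)).bool()
    labels[~masked] = -100  # loss is only computed on masked positions
    # 80% of masked positions become [MASK]
    replace_mask = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked
    input_ids[replace_mask] = mask_token_id
    # 10% become a random token, the remaining 10% stay unchanged
    random_mask = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked & ~replace_mask
    input_ids[random_mask] = torch.randint(vocab_size, labels.shape, dtype=torch.long)[random_mask]
    return input_ids, labels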
def test(use_cuda: bool):
    test_device_name = "GPU" if use_cuda else "CPU"
    test_device = torch.device('cuda:0') if use_cuda else torch.device('cpu:0')

    cfg = transformers.BertConfig()

    # use 4 threads for computing
    turbo_transformers.set_num_threads(4)

    input_ids = np.array(
        ([12166, 10699, 16752, 4454], [5342, 16471, 817, 16022]),
        dtype=np.int64)
    segment_ids = np.array(([1, 1, 1, 0], [1, 0, 0, 0]), dtype=np.int64)

    input_ids_tensor = turbo_transformers.nparray2tensor(input_ids, test_device_name)
    segment_ids_tensor = turbo_transformers.nparray2tensor(segment_ids, test_device_name)

    # 3. load model from npz
    if len(sys.argv) == 2:
        in_file = sys.argv[1]
        print(in_file)
    else:
        in_file = "/home/jiaruifang/codes/TurboTransformers/bert.npz"

    # 255 MiB
    tt_model = turbo_transformers.BertModel.from_npz(in_file, cfg, test_device)
    # 1169 MiB

    start_time = time.time()
    for _ in range(10):
        res = tt_model(input_ids_tensor,
                       token_type_ids=segment_ids_tensor,
                       return_type=turbo_transformers.ReturnType.NUMPY
                       )  # sequence_output, pooled_output
    end_time = time.time()

    print("turbo bert sequence output:", res[0][:, 0, :])
    print("turbo bert pooler output: ", res[1])  # pooled_output
    print("\nturbo time consum: {}".format(end_time - start_time))
def benchmark_torch(model_name: str, seq_len: int, batch_size: int, n: int,
                    enable_random: bool, max_seq_len: int, min_seq_len: int,
                    num_threads: int, use_gpu: bool, enable_mem_opt: bool):
    import torch
    import transformers
    import benchmark_helper

    test_device = torch.device('cuda:0') if use_gpu else torch.device('cpu:0')
    torch.set_grad_enabled(False)
    torch.set_num_threads(num_threads)

    cfg = None
    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
    elif model_name == "distilbert":
        cfg = transformers.DistilBertConfig()
        model = transformers.DistilBertModel(cfg)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    model.eval()
    model.to(test_device)
    # cfg = model.config  # type: transformers.BertConfig

    if enable_random:
        benchmark_helper.run_variable_model(model, use_gpu, n, max_seq_len,
                                            min_seq_len, "torch", num_threads,
                                            cfg, enable_mem_opt, model_name)
    else:
        input_ids = torch.randint(low=0,
                                  high=cfg.vocab_size - 1,
                                  size=(batch_size, seq_len),
                                  dtype=torch.long,
                                  device=test_device)
        benchmark_helper.run_model(lambda: model(input_ids), use_gpu, n,
                                   batch_size, seq_len, "torch", num_threads,
                                   enable_mem_opt, model_name)
def benchmark_turbo_transformers(model_name: str, seq_len: int,
                                 batch_size: int, n: int):
    import torch
    import transformers
    import contexttimer
    import turbo_transformers
    import benchmark_helper

    if not torch.cuda.is_available():
        print("cuda is not available for torch")
        return

    test_device = torch.device('cuda:0')

    if model_name == "bert":
        cfg = transformers.BertConfig()
        model = transformers.BertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.BertModel.from_torch(model)
    elif model_name == "albert":
        cfg = transformers.AlbertConfig()
        model = transformers.AlbertModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.AlbertModel.from_torch(model)
    elif model_name == "roberta":
        cfg = transformers.RobertaConfig()
        model = transformers.RobertaModel(cfg)
        model.to(test_device)
        model.eval()
        model = turbo_transformers.RobertaModel.from_torch(model)
    else:
        raise ValueError(f"benchmark does not support {model_name}")

    cfg = model.config  # type: transformers.BertConfig
    input_ids = torch.randint(low=0,
                              high=cfg.vocab_size - 1,
                              size=(batch_size, seq_len),
                              dtype=torch.long,
                              device=test_device)
    benchmark_helper.run_model(lambda: model(input_ids), True, n, batch_size,
                               seq_len, "turbo")
def build_ner_biobert_model(hparams):
    # data_dir = pathlib.Path(DATA_DIR)
    exp_dirpath = pathlib.Path(dirname)
    train_filename = "./ner/%s/%s.train" % (hparams.get("dataset"), hparams.get("dataset"))
    dev_filename = "./ner/%s/%s.dev" % (hparams.get("dataset"), hparams.get("dataset"))
    test_filename = "./ner/%s/%s.test" % (hparams.get("dataset"), hparams.get("dataset"))

    data_manager = BioNERDatasetManager(
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["NER"],
        train_only="ner",
    )

    config = transformers.BertConfig(output_hidden_states=True, vocab_size=28996)
    model = AutoModelWithLMHead.from_pretrained("monologg/biobert_v1.1_pubmed", config=config)
    tokenizer = AutoTokenizer.from_pretrained("monologg/biobert_v1.1_pubmed")

    # TODO: Specifying the max length
    biobert2seqencoder = Biobert2SeqEncoder(
        tokenizer=tokenizer,
        model=model,
        device=torch.device(hparams.get("device")),
    )

    model = RnnSeqCrfTagger(
        rnn2seqencoder=biobert2seqencoder,
        encoding_dim=768,
        device=torch.device(hparams.get("device")),
        datasets_manager=data_manager,
    )

    infer = SequenceLabellingInference(
        model=model,
        model_filepath=str(exp_dirpath.joinpath("checkpoints", "best_model.pt")),
        datasets_manager=data_manager,
    )

    return infer
def __init__(self, p):
    super().__init__()
    self.bert_config = transformers.BertConfig(
        vocab_size=1,
        hidden_size=p.input_dim,
        num_hidden_layers=p.n_layers,
        num_attention_heads=p.n_heads,
        intermediate_size=p.ffn_dim,
        hidden_dropout_prob=p.dropout_p,
        attention_probs_dropout_prob=p.dropout_p,
        max_position_embeddings=p.seq_length,
        position_embedding_type='absolute'
    )
    self.bert = BertModel(self.bert_config)
    self.conv = nn.Conv2d(1, 1, kernel_size=(1, p.input_dim))
    self.fc = nn.Linear(in_features=p.input_dim, out_features=1)
    self.relu = nn.ReLU()
    self.last_layer = p.last_layer
def __init__(self, hp: Optional[ModelParams] = ModelParams()):
    super().__init__()
    self.hp = hp
    config = transformers.BertConfig(
        hidden_size=self.hp.dim,
        num_hidden_layers=0,
        num_attention_heads=1,
        intermediate_size=0,
        max_position_embeddings=self.hp.max_seq_len,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
    )
    self.e = transformers.BertModel(config, add_pooling_layer=False)
    for name, param in self.e.named_parameters():
        # full parameter names look like 'embeddings.position_embeddings.weight',
        # so a substring check is needed (an exact == comparison never matches)
        if 'position_embeddings' in name:
            requires_grad = False
        else:
            requires_grad = self.hp.requires_grad
        param.requires_grad = requires_grad
def test_constant_lrschedule():
    """
    Test that lr schedule "constant" results in unchanging LR
    """
    import warnings
    warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)

    args = """
    --config unit_test
    --lr-schedule constant
    """.split()
    config = transformers.BertConfig(**(vars(parse_bert_args(args))))
    opts = get_options(config)

    # IPU Model and Optimizer
    model = PipelinedBertWithLoss(config).half().train()
    optimizer = get_optimizer(config, model)
    scheduler = get_lr_scheduler(optimizer, "constant")
    poptorch_model = poptorch.trainingModel(model, opts, optimizer=optimizer)

    def mock_data():
        return get_generated_datum(config)

    # Compile the model
    poptorch_model.compile(*mock_data())

    # Starting LR should equal the configured learning rate
    assert poptorch_model._dict_optimizer["groups"][0]["learningRate"][0] == config.learning_rate

    # Run for some steps
    for _ in range(5):
        outputs = poptorch_model(*mock_data())
        scheduler.step()
        poptorch_model.setOptimizer(optimizer)

    # LR should be unchanged
    assert poptorch_model._dict_new_optimizer["groups"][0]["learningRate"][0] == config.learning_rate