    def create_and_check_attention_mask_determinism(self, config, input_ids,
                                                    token_type_ids, input_mask,
                                                    sequence_labels,
                                                    token_labels,
                                                    choice_labels):
        model = LongformerModel(config=config)
        model.to(torch_device)
        model.eval()

        attention_mask = torch.ones(input_ids.shape,
                                    dtype=torch.long,
                                    device=torch_device)
        output_with_mask = model(
            input_ids, attention_mask=attention_mask)["last_hidden_state"]
        output_without_mask = model(input_ids)["last_hidden_state"]
        self.parent.assertTrue(
            torch.allclose(output_with_mask[0, 0, :5],
                           output_without_mask[0, 0, :5],
                           atol=1e-4))

    def create_and_check_model_with_global_attention_mask(
        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
    ):
        model = LongformerModel(config=config)
        model.to(torch_device)
        model.eval()
        global_attention_mask = input_mask.clone()
        global_attention_mask[:, input_mask.shape[-1] // 2] = 0
        global_attention_mask = global_attention_mask.to(torch_device)

        result = model(
            input_ids,
            attention_mask=input_mask,
            global_attention_mask=global_attention_mask,
            token_type_ids=token_type_ids,
        )
        result = model(input_ids, token_type_ids=token_type_ids, global_attention_mask=global_attention_mask)
        result = model(input_ids, global_attention_mask=global_attention_mask)

        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
        self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))

    def create_and_check_longformer_model(self, config, input_ids,
                                          token_type_ids, input_mask,
                                          sequence_labels, token_labels,
                                          choice_labels):
        model = LongformerModel(config=config)
        model.to(torch_device)
        model.eval()
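        # Tuple unpacking below assumes an older transformers release (or return_dict=False),
        # where the model returns (sequence_output, pooled_output, ...) directly.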
        sequence_output, pooled_output = model(input_ids,
                                               attention_mask=input_mask,
                                               token_type_ids=token_type_ids)
        sequence_output, pooled_output = model(input_ids,
                                               token_type_ids=token_type_ids)
        sequence_output, pooled_output = model(input_ids)

        result = {
            "sequence_output": sequence_output,
            "pooled_output": pooled_output,
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].size()),
            [self.batch_size, self.seq_length, self.hidden_size])
        self.parent.assertListEqual(list(result["pooled_output"].size()),
                                    [self.batch_size, self.hidden_size])
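These tester helpers are normally driven from a ModelTesterMixin-style test case; a minimal sketch of how they are usually invoked, assuming the surrounding tester also defines prepare_config_and_inputs():

    def test_longformer_model(self):
        # prepare_config_and_inputs is assumed to return the same tuple the helpers
        # above accept: (config, input_ids, token_type_ids, input_mask,
        # sequence_labels, token_labels, choice_labels).
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_longformer_model(*config_and_inputs)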
Example #4
    def __init__(self, model_name: str = "allenai/longformer-base-4096"):
        self.model = LongformerModel.from_pretrained(model_name)
        self.tokenizer = LongformerTokenizer.from_pretrained(model_name)
    def __init__(self, config, project_dim: int = 0, seq_project=True):
        LongformerModel.__init__(self, config)
        assert config.hidden_size > 0, 'Encoder hidden_size can\'t be zero'
        self.encode_proj = nn.Linear(config.hidden_size, project_dim) if project_dim != 0 else None
        self.seq_project = seq_project
        self.init_weights()
Example #6
print(df.target.value_counts())

import torch
from transformers import DistilBertTokenizerFast, DistilBertModel, DistilBertConfig
from transformers import LongformerTokenizerFast, LongformerModel, LongformerConfig

#model_name = 'distilbert-base-uncased'
model_name = 'allenai/longformer-base-4096'
tokenizer = LongformerTokenizerFast.from_pretrained(model_name)

df["vecs"] = df.text.map(
    lambda x: torch.LongTensor(tokenizer.encode(x)).unsqueeze(0))

config = LongformerConfig.from_pretrained(model_name,
                                          output_hidden_states=True)
model = LongformerModel.from_pretrained(model_name, config=config)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device = 'cpu'

model = model.to(device)
input_tf = tokenizer.batch_encode_plus(df.text.to_list(),
                                       return_tensors='pt',
                                       padding=True)
#vecs = input_tf['input_ids'].to(device)
#granola_ids = granola_ids.to(device)

model.eval()

with torch.no_grad():
    print("and GO!!!!")
Example #7
import torch
from transformers import ElectraForMaskedLM, ElectraTokenizer
tokenizer = ElectraTokenizer.from_pretrained('google/electra-small-generator')
model = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')

input_ids = torch.tensor(
    tokenizer.encode("Hello, my dog is cute",
                     add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs = model(input_ids, labels=input_ids)  # masked_lm_labels was renamed to labels in transformers v4

loss, prediction_scores = outputs[:2]
print(prediction_scores)

## Longformer
from transformers import LongformerModel, LongformerTokenizer

model = LongformerModel.from_pretrained('allenai/longformer-base-4096')
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')

SAMPLE_TEXT = ' '.join(['Hello world! '] * 1000)  # long input document
input_ids = torch.tensor(tokenizer.encode(SAMPLE_TEXT)).unsqueeze(
    0)  # batch of size 1

# Attention mask values -- 0: no attention, 1: local attention, 2: global attention
attention_mask = torch.ones(
    input_ids.shape, dtype=torch.long,
    device=input_ids.device)  # initialize to local attention
attention_mask[:, [1, 4, 21]] = 2  # Set global attention based on the task. For example,
                                   # classification typically puts global attention on the <s> (CLS) token.
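The snippet stops before the forward pass; a minimal continuation sketch follows. Recent transformers releases expect a separate global_attention_mask rather than the 2-valued mask used above, so the current-API equivalent is shown.

global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device)
global_attention_mask[:, [1, 4, 21]] = 1  # same positions as above, expressed in the current API

output = model(input_ids,
               attention_mask=torch.ones_like(input_ids),
               global_attention_mask=global_attention_mask)
sequence_output = output.last_hidden_state  # (1, seq_len, hidden_size)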
Example #8
def test_all(args):
    # Currently, the longformer attention operator can only run on GPU (no CPU implementation yet).
    device = torch.device('cuda:0')

    results = []
    for model_name in args.models:
        # Here we run an example input
        from transformers import LongformerModel
        torch_model_name_or_dir = PRETRAINED_LONGFORMER_MODELS[model_name]
        model = LongformerModel.from_pretrained(
            torch_model_name_or_dir)  # pretrained model name or directory
        model.to(device)

        # Search onnx model in the following order: optimized fp16 model, optimized fp32 model, raw model
        # TODO: call convert_longformer_to_onnx to export onnx instead.
        import os.path
        optimized = False
        precision = 'fp32'
        onnx_model_path = os.path.join(args.onnx_dir, model_name + ".onnx")
        optimized_fp32_model = os.path.join(args.onnx_dir,
                                            model_name + "_fp32.onnx")
        optimized_fp16_model = os.path.join(args.onnx_dir,
                                            model_name + "_fp16.onnx")
        if os.path.isfile(optimized_fp16_model):
            onnx_model_path = optimized_fp16_model
            optimized = True
            precision = 'fp16'
        elif os.path.isfile(optimized_fp32_model):
            onnx_model_path = optimized_fp32_model
            optimized = True
        print("ONNX model path:", onnx_model_path)

        for num_threads in args.num_threads:
            if "torch" in args.engines:
                results += test_torch_latency(device, model, model_name,
                                              args.batch_sizes,
                                              args.sequence_lengths,
                                              args.global_lengths,
                                              args.test_times, num_threads,
                                              args.verbose)

            if "onnxruntime" in args.engines:
                if args.memory:
                    test_ort_memory(device, onnx_model_path,
                                    args.batch_sizes[0],
                                    args.sequence_lengths[0],
                                    args.global_lengths[0], args.test_times,
                                    num_threads)
                else:  # test latency
                    session = benchmark_helper.create_onnxruntime_session(
                        onnx_model_path,
                        use_gpu=True,
                        enable_all_optimization=True,
                        num_threads=num_threads)
                    if session is None:
                        raise RuntimeError(
                            f"Failed to create ORT sesssion from ONNX file {onnx_model_path}"
                        )

                    results += test_ort_latency(
                        device, model, model_name, session, args.batch_sizes,
                        args.sequence_lengths, args.global_lengths,
                        args.test_times, num_threads, optimized, precision,
                        args.validate_onnx, args.disable_io_binding,
                        args.verbose)
    return results
Example #9
    def __init__(self):
        super(Model, self).__init__()
        self.model = LongformerModel.from_pretrained(model_config.pretrain_model_path, gradient_checkpointing=True)
        self.config = self.model.config
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, config.num_labels)
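A hedged sketch of the forward pass this head implies (not part of the original example; argument names are illustrative):

    def forward(self, input_ids, attention_mask=None):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        pooled = self.dropout(outputs.pooler_output)
        return self.classifier(pooled)  # (batch_size, num_labels)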
Example #10
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

import en_core_web_sm
import gensim
from torch import nn as nn

from config import SBERT_MODEL_NAME
from utils.types import FolkLoreData, FolkLoreEmb, FolkLoreEmbCoarse

nlp = en_core_web_sm.load()
sbert_model = SentenceTransformer(SBERT_MODEL_NAME)

from transformers import LongformerModel, LongformerTokenizerFast, LongformerConfig

LFconfig = LongformerConfig.from_pretrained('allenai/longformer-base-4096')
LF_model = LongformerModel.from_pretrained('allenai/longformer-base-4096',
                                           config=LFconfig)
LF_tokenizer = LongformerTokenizerFast.from_pretrained(
    'allenai/longformer-base-4096')
LF_tokenizer.model_max_length = LF_model.config.max_position_embeddings


class MatrixVectorScaledDotProductAttention(nn.Module):
    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, q, k, v, mask=None):
        """
		q: tensor of shape (n*b, d_k)
Example #11
    def __init__(self, config):
        super(LongformerQA, self).__init__(config)
        self.longformer = LongformerModel(config)
        self.qa_outputs = torch.nn.Linear(config.hidden_size,
                                          config.num_labels)
        self.init_weights()
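For context, a hedged sketch of a typical question-answering forward pass built on these layers (not part of the original example):

    def forward(self, input_ids, attention_mask=None, global_attention_mask=None):
        sequence_output = self.longformer(
            input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
        )[0]
        logits = self.qa_outputs(sequence_output)           # (batch, seq_len, num_labels)
        start_logits, end_logits = logits.split(1, dim=-1)  # num_labels == 2: span start/end
        return start_logits.squeeze(-1), end_logits.squeeze(-1)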
Example #12
    def __init__(self, config_path):
        config = configparser.ConfigParser()
        config.read(config_path)

        self.n_epoch = config.getint("general", "n_epoch")
        self.batch_size = config.getint("general", "batch_size")
        self.train_bert = config.getboolean("general", "train_bert")
        self.lr = config.getfloat("general", "lr")
        self.cut_frac = config.getfloat("general", "cut_frac")
        self.log_dir = Path(config.get("general", "log_dir"))
        if not self.log_dir.exists():
            self.log_dir.mkdir(parents=True)
        self.model_save_freq = config.getint("general", "model_save_freq")

        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # bert_config_path = config.get("bert", "config_path")
        # bert_tokenizer_path = config.get("bert", "tokenizer_path")
        # bert_model_path = config.get("bert", "model_path")

        self.bert_tokenizer = LongformerTokenizer.from_pretrained(
            'allenai/longformer-base-4096')
        # self.bert_tokenizer = BertTokenizer.from_pretrained(bert_tokenizer_path)
        tkzer_save_dir = self.log_dir / "tokenizer"
        if not tkzer_save_dir.exists():
            tkzer_save_dir.mkdir()
        self.bert_tokenizer.save_pretrained(tkzer_save_dir)
        self.bert_model = LongformerModel.from_pretrained(
            'allenai/longformer-base-4096')
        self.bert_config = self.bert_model.config
        # self.bert_config = BertConfig.from_pretrained(bert_config_path)
        # self.bert_model = BertModel.from_pretrained(bert_model_path, config=self.bert_config)
        self.max_seq_length = self.bert_config.max_position_embeddings - 2
        # self.max_seq_length = self.bert_config.max_position_embeddings
        self.bert_model.to(self.device)

        if self.train_bert:
            self.bert_model.train()
        else:
            self.bert_model.eval()

        train_conll_path = config.get("data", "train_path")
        print("train path", train_conll_path)
        assert Path(train_conll_path).exists()
        dev_conll_path = config.get("data", "dev_path")
        print("dev path", dev_conll_path)
        assert Path(dev_conll_path).exists()
        dev1_conll_path = Path(dev_conll_path) / "1"
        print("dev1 path", dev1_conll_path)
        assert dev1_conll_path.exists()
        dev2_conll_path = Path(dev_conll_path) / "2"
        print("dev2 path", dev2_conll_path)
        assert dev2_conll_path.exists()
        self.train_dataset = ConllDataset(train_conll_path)
        # self.dev_dataset = ConllDataset(dev_conll_path)
        self.dev1_dataset = ConllDataset(dev1_conll_path)
        self.dev2_dataset = ConllDataset(dev2_conll_path)
        if self.batch_size == -1:
            self.batch_size = len(self.train_dataset)

        self.scaler = torch.cuda.amp.GradScaler()
        tb_cmt = f"lr_{self.lr}_cut-frac_{self.cut_frac}"
        self.writer = SummaryWriter(log_dir=self.log_dir, comment=tb_cmt)
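For reference, a hypothetical config file covering the keys read above (section and option names come from the code; every value is a placeholder):

import configparser

sample_cfg = """
[general]
n_epoch = 10
batch_size = 8
train_bert = true
lr = 3e-5
cut_frac = 0.1
log_dir = ./logs
model_save_freq = 1

[data]
train_path = ./data/train.conll
dev_path = ./data/dev
"""
parsed = configparser.ConfigParser()
parsed.read_string(sample_cfg)
assert parsed.getfloat("general", "lr") == 3e-5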
Example #13
    def __init__(self, task_configs=[],
                 device='cpu',
                 finetuning=True,
                 lm='bert',
                 bert_pt=None,
                 bert_path=None):
        super().__init__()

        assert len(task_configs) > 0

        # load the model or model checkpoint
        if bert_path is None:
            if lm == 'bert':
                self.bert = BertModel.from_pretrained(model_ckpts[lm])
            elif lm == 'distilbert':
                self.bert = DistilBertModel.from_pretrained(model_ckpts[lm])
            elif lm == 'albert':
                self.bert = AlbertModel.from_pretrained(model_ckpts[lm])
            elif lm == 'xlnet':
                self.bert = XLNetModel.from_pretrained(model_ckpts[lm])
            elif lm == 'roberta':
                self.bert = RobertaModel.from_pretrained(model_ckpts[lm])
            elif lm == 'longformer':
                self.bert = LongformerModel.from_pretrained(model_ckpts[lm])
        else:
            output_model_file = bert_path
            model_state_dict = torch.load(output_model_file,
                                          map_location=lambda storage, loc: storage)
            if lm == 'bert':
                self.bert = BertModel.from_pretrained(model_ckpts[lm],
                        state_dict=model_state_dict)
            elif lm == 'distilbert':
                self.bert = DistilBertModel.from_pretrained(model_ckpts[lm],
                        state_dict=model_state_dict)
            elif lm == 'albert':
                self.bert = AlbertModel.from_pretrained(model_ckpts[lm],
                        state_dict=model_state_dict)
            elif lm == 'xlnet':
                self.bert = XLNetModel.from_pretrained(model_ckpts[lm],
                        state_dict=model_state_dict)
            elif lm == 'roberta':
                self.bert = RobertaModel.from_pretrained(model_ckpts[lm],
                        state_dict=model_state_dict)

        self.device = device
        self.finetuning = finetuning
        self.task_configs = task_configs
        self.module_dict = nn.ModuleDict({})
        self.lm = lm

        # hard coded for now
        hidden_size = 768
        hidden_dropout_prob = 0.1

        for config in task_configs:
            name = config['name']
            task_type = config['task_type']
            vocab = config['vocab']

            if task_type == 'tagging':
                # for tagging
                vocab_size = len(vocab) # 'O' and '<PAD>'
                if 'O' not in vocab:
                    vocab_size += 1
                if '<PAD>' not in vocab:
                    vocab_size += 1
            else:
                # for pairing and classification
                vocab_size = len(vocab)

            self.module_dict['%s_dropout' % name] = nn.Dropout(hidden_dropout_prob)
            self.module_dict['%s_fc' % name] = nn.Linear(hidden_size, vocab_size)
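For reference, a hypothetical task_configs value consumed by the loop above (task names and vocabularies are illustrative, not from the original):

task_configs = [
    {"name": "ner_tagging", "task_type": "tagging", "vocab": ["B-PER", "I-PER", "O", "<PAD>"]},
    {"name": "topic_cls", "task_type": "classification", "vocab": ["sports", "politics", "tech"]},
]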
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long,
                                device=input_ids.device)  # TODO: use random word ID. #TODO: simulate masked word
    global_attention_mask = torch.zeros(input_ids.shape, dtype=torch.long, device=input_ids.device)
    if num_global_tokens > 0:
        global_token_index = list(range(num_global_tokens))
        global_attention_mask[:, global_token_index] = 1
    # TODO: support more inputs like token_type_ids, position_ids
    return input_ids, attention_mask, global_attention_mask

args = parse_arguments()

model_name = args.model
onnx_model_path = model_name + ".onnx"

from transformers import LongformerModel
model = LongformerModel.from_pretrained(MODELS[model_name]) # pretrained model name or directory

input_ids, attention_mask, global_attention_mask = get_dummy_inputs(sequence_length=args.sequence_length, num_global_tokens=args.global_length, device=torch.device('cpu'))

example_outputs = model(input_ids, attention_mask=attention_mask, global_attention_mask=global_attention_mask)

# A new function to replace LongformerSelfAttention.forward
# For transformers 4.0
def my_longformer_self_attention_forward_4(self, hidden_states, attention_mask=None, is_index_masked=None, is_index_global_attn=None, is_global_attn=None):
    # TODO: move mask calculation to LongFormerModel class to avoid calculating it again and again in each layer.
    global_mask = is_index_global_attn.int()
    torch.masked_fill(attention_mask, is_index_global_attn, 0.0)  # note: masked_fill is not in-place, so this result is discarded as written

    weight = torch.stack((self.query.weight.transpose(0,1), self.key.weight.transpose(0,1), self.value.weight.transpose(0,1)), dim=1)
    weight = weight.reshape(self.embed_dim, 3*self.embed_dim)
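The snippet is cut off here. For context, a hedged sketch of how such a replacement is typically wired in before export; the real conversion also registers a custom ONNX attention operator, and the module path may differ across transformers versions:

from transformers.models.longformer.modeling_longformer import LongformerSelfAttention

# Swap in the custom forward so the exported graph takes the fused attention path.
LongformerSelfAttention.forward = my_longformer_self_attention_forward_4

torch.onnx.export(
    model,
    (input_ids, attention_mask, global_attention_mask),
    onnx_model_path,
    opset_version=12,
    input_names=["input_ids", "attention_mask", "global_attention_mask"],
    output_names=["last_hidden_state", "pooler_output"],
    dynamic_axes={"input_ids": {0: "batch", 1: "sequence"},
                  "attention_mask": {0: "batch", 1: "sequence"},
                  "global_attention_mask": {0: "batch", 1: "sequence"}},
)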
    def __init__(self, config):
        super().__init__(config)
        self.longformer = LongformerModel(config)
        self.init_weights()
Example #16
    def test_layer_attn_probs(self):
        model = LongformerModel.from_pretrained(
            "patrickvonplaten/longformer-random-tiny")
        model.eval()
        layer = model.encoder.layer[0].attention.self.to(torch_device)
        hidden_states = torch.cat(
            [self._get_hidden_states(),
             self._get_hidden_states() - 0.5],
            dim=0)
        batch_size, seq_length, hidden_size = hidden_states.size()
        attention_mask = torch.zeros((batch_size, seq_length),
                                     dtype=torch.float32,
                                     device=torch_device)

        # create attn mask
        attention_mask[0, -2:] = 10000.0
        attention_mask[0, -1:] = -10000.0
        attention_mask[1, 1:] = 10000.0

        is_index_masked = attention_mask < 0
        is_index_global_attn = attention_mask > 0
        is_global_attn = is_index_global_attn.flatten().any().item()

        output_hidden_states, local_attentions, global_attentions = layer(
            hidden_states,
            attention_mask=attention_mask,
            is_index_masked=is_index_masked,
            is_index_global_attn=is_index_global_attn,
            is_global_attn=is_global_attn,
            output_attentions=True,
        )

        self.assertEqual(local_attentions.shape, (2, 4, 2, 8))
        self.assertEqual(global_attentions.shape, (2, 2, 3, 4))

        # All tokens with global attention have weight 0 in local attentions.
        self.assertTrue(torch.all(local_attentions[0, 2:4, :, :] == 0))
        self.assertTrue(torch.all(local_attentions[1, 1:4, :, :] == 0))

        # The attention distribution of each token with global attention must sum to 1.
        self.assertTrue(
            torch.all(
                torch.abs(global_attentions[0, :, :2, :].sum(dim=-1) -
                          1) < 1e-6))
        self.assertTrue(
            torch.all(
                torch.abs(global_attentions[1, :, :1, :].sum(dim=-1) -
                          1) < 1e-6))

        self.assertTrue(
            torch.allclose(
                local_attentions[0, 0, 0, :],
                torch.tensor(
                    [
                        0.3328, 0.0000, 0.0000, 0.0000, 0.0000, 0.3355, 0.3318,
                        0.0000
                    ],
                    dtype=torch.float32,
                    device=torch_device,
                ),
                atol=1e-3,
            ))

        self.assertTrue(
            torch.allclose(
                local_attentions[1, 0, 0, :],
                torch.tensor(
                    [
                        0.2492, 0.2502, 0.2502, 0.0000, 0.0000, 0.2505, 0.0000,
                        0.0000
                    ],
                    dtype=torch.float32,
                    device=torch_device,
                ),
                atol=1e-3,
            ))

        # All the global attention weights must sum to 1.
        self.assertTrue(
            torch.all(torch.abs(global_attentions.sum(dim=-1) - 1) < 1e-6))

        self.assertTrue(
            torch.allclose(
                global_attentions[0, 0, 1, :],
                torch.tensor(
                    [0.2500, 0.2500, 0.2500, 0.2500],
                    dtype=torch.float32,
                    device=torch_device,
                ),
                atol=1e-3,
            ))

        self.assertTrue(
            torch.allclose(
                global_attentions[1, 0, 0, :],
                torch.tensor(
                    [0.2497, 0.2500, 0.2499, 0.2504],
                    dtype=torch.float32,
                    device=torch_device,
                ),
                atol=1e-3,
            ))
Example #17
    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels

        self.roberta = LongformerModel(config)
        self.classifier = RobertaClassificationHead(config)