def __init__(
        self,
        pretrained_model_name_or_path: str = "gpt2-medium",
        layers: str = "1",
        pooling_operation: str = "first_last",
        use_scalar_mix: bool = False,
    ):
        """OpenAI GPT-2 embeddings, as proposed in Radford et al. 2019.
        :param pretrained_model_name_or_path: name or path of OpenAI GPT-2 model
        :param layers: comma-separated list of layers
        :param pooling_operation: defines pooling operation for subwords
        :param use_scalar_mix: defines the usage of scalar mix for specified layer(s)
        """
        super().__init__()

        self.tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path)
        self.model = GPT2Model.from_pretrained(
            pretrained_model_name_or_path=pretrained_model_name_or_path, output_hidden_states=True
        )
        self.name = pretrained_model_name_or_path
        self.layers: List[int] = [int(layer) for layer in layers.split(",")]
        self.pooling_operation = pooling_operation
        self.use_scalar_mix = use_scalar_mix
        self.static_embeddings = True

        dummy_sentence: Sentence = Sentence()
        dummy_sentence.add_token(Token("hello"))
        embedded_dummy = self.embed(dummy_sentence)
        self.__embedding_length: int = len(
            embedded_dummy[0].get_token(1).get_embedding()
        )
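For context (not part of the original snippet), a minimal usage sketch of the embeddings class defined above, assuming flair's Sentence and the OpenAIGPT2Embeddings class are importable:

from flair.data import Sentence

# Usage sketch (assumed API, mirroring the constructor above)
embeddings = OpenAIGPT2Embeddings(pretrained_model_name_or_path="gpt2-medium",
                                  layers="1",
                                  pooling_operation="first_last")
sentence = Sentence("Berlin is a city .")
embeddings.embed(sentence)
print(sentence.tokens[0].embedding.shape)  # per-token GPT-2 vector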
Example #2
    def __init__(self, config):
        super(GPT2ClassificationModel, self).__init__(config)
        self.transformer = GPT2Model(config)
        self.classifier1 = torch.nn.Linear(config.n_embd, config.num_labels)
        self.dropout = torch.nn.Dropout(config.summary_first_dropout)
        self.loss_fct = torch.nn.CrossEntropyLoss()

        self.init_weights()
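The forward pass is not shown in this excerpt; below is a rough, self-contained sketch of how such a classification head can be applied on top of GPT-2's hidden states. The last-token pooling and the 2-label head are illustrative assumptions, not the original author's code.

import torch
from pytorch_transformers import GPT2Model, GPT2Tokenizer

model = GPT2Model.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
classifier = torch.nn.Linear(model.config.n_embd, 2)  # 2 labels, illustrative only
dropout = torch.nn.Dropout(0.1)

input_ids = torch.tensor([tokenizer.encode("GPT-2 as a sentence classifier")])
hidden_states = model(input_ids)[0]         # (batch, seq_len, n_embd)
pooled = dropout(hidden_states[:, -1, :])   # pool with the last token's state (assumption)
logits = classifier(pooled)                 # (batch, 2)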
Example #3
    def __init__(self, model_path):
        super(OnmtGPT2Encoder, self).__init__()
        config = GPT2Config.from_json_file(
            os.path.join(model_path, "config.json"))
        pretrained_dict = os.path.join(model_path, "pytorch_model.bin")
        if os.path.exists(pretrained_dict):
            model = GPT2Model.from_pretrained(
                pretrained_model_name_or_path=pretrained_dict, config=config)
            print("init GPT2 model with {} weights".format(
                len(model.state_dict())))
        else:
            model = GPT2Model(config)

        model.wte = expandEmbeddingByN(model.wte, 4)
        self.encoder = model

        #print(model)
        print("***" * 20)
Example #4
    def __init__(self, config):
        super(AblationLongGPT2, self).__init__(config)
        self.sequence_len = config.sequence_len
        
        self.transformer = GPT2Model(config)
        self.dropout = torch.nn.Dropout(config.summary_first_dropout)
        self.classifier1 = torch.nn.Linear(config.n_embd * self.sequence_len, config.num_labels)

        self.loss_fct = torch.nn.CrossEntropyLoss()
        self.init_weights()
Example #5
    def __init__(self, config):
        super(GPT2ClassHeadsModel, self).__init__(config)
        self.transformer = GPT2Model(config)
        
        self.classifier = nn.Linear(config.n_embd, 2)
        # self.classifier = nn.Sequential(nn.Linear(config.n_embd, 768), nn.ReLU(), nn.Dropout(p=0.2),
        #                                 nn.Linear(768, 2))
        # self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)

        self.init_weights()
Example #6
    def __init__(self, chunck_size=64, max_length=35, device=torch.device('cuda:0')):
        super(GPT2Client, self).__init__()
        self.chunck_size = chunck_size
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        self.max_length = max_length
        # load the model
        self.model = GPT2Model.from_pretrained('gpt2')
        self.model.eval()
        self.device = device
        # move model to device
        self.model.to(self.device)
Example #7
    def __init__(self, vocab_size, device):
        super().__init__()

        self.hidden_size = 768
        self.gpt2model = GPT2Model.from_pretrained('gpt2')
        self.gpt2model.resize_token_embeddings(vocab_size)

        for param in self.gpt2model.parameters():
            param.requires_grad = False

        self.device = device
        self.to(device)
Example #8
    def __init__(self, config):
        super(GPT2_adverse, self).__init__(config)
        self.transformer = GPT2Model(config)
        self.lm_head = nn.Linear(2 * config.n_embd,
                                 config.vocab_size,
                                 bias=False)
        self.pos_head_norm = nn.Linear(config.n_embd,
                                       config.pos_vocab_size,
                                       bias=True)
        self.pos_head_adv = nn.Linear(config.n_embd,
                                      config.pos_vocab_size,
                                      bias=True)
        self.syn_layer = nn.Linear(config.n_embd, config.n_embd, bias=True)
        self.sem_layer = nn.Linear(config.n_embd, config.n_embd, bias=True)
        self.apply(self.init_weights)
Example #9
    def __init__(self, config,):
        super(AttentionLongGPT2, self).__init__(config)
        self.sequence_len = config.sequence_len
        self.transformer = GPT2Model(config)
        
        self.dropout = torch.nn.Dropout(config.summary_first_dropout)
        
        self.classifier1 = torch.nn.Linear(config.n_embd * 9, config.num_labels)
        self.attention1 = torch.nn.Linear(self.sequence_len, 64)
        self.attention2 = torch.nn.Linear(64, 128)
        self.attention3 = torch.nn.Linear(128 + config.n_embd, 2*config.n_embd)
        self.leaky = torch.nn.LeakyReLU(0.2)

        self.att = NewAttention(config.n_embd)

        self.loss_fct = torch.nn.CrossEntropyLoss()
        self.init_weights()
Example #10
    def create_and_check_gpt2_model(self, config, input_ids, head_mask,
                                    token_type_ids, *args):
        model = GPT2Model(config=config)
        model.eval()

        model(input_ids,
              token_type_ids=token_type_ids,
              head_mask=head_mask)
        model(input_ids, token_type_ids=token_type_ids)
        sequence_output, presents = model(input_ids)

        result = {
            "sequence_output": sequence_output,
            "presents": presents,
        }
        self.parent.assertListEqual(
            list(result["sequence_output"].size()),
            [self.batch_size, self.seq_length, self.hidden_size])
        self.parent.assertEqual(len(result["presents"]), config.n_layer)
Example #11
    def __init__(self, gpt2_model, language, name, loi, cuda=False):
        super(GPT2, self).__init__()
        # Load pre-trained model tokenizer (vocabulary)
        # Crucially, do not do basic tokenization; PTB is already tokenized. Just do subword (BPE) tokenization.
        if gpt2_model not in ['small', 'medium']:
            raise ValueError("GPT2 model must be small or medium")
        self.model = GPT2Model.from_pretrained(
            'gpt2{}'.format('' if gpt2_model == 'small' else '-medium'),
            output_hidden_states=True)
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            'gpt2{}'.format('' if gpt2_model == 'small' else '-medium'))

        self.language = language
        self.LAYER_COUNT = parameters[gpt2_model]['LAYER_COUNT']
        self.FEATURE_COUNT = parameters[gpt2_model]['FEATURE_COUNT']
        self.name = name
        self.loi = np.array(loi) if loi else np.arange(
            parameters[gpt2_model]['LAYER_COUNT'])  # loi: layers of interest
        self.cuda = cuda
Example #12
def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file,
                                       pytorch_dump_folder_path):
    # Construct model
    if gpt2_config_file == "":
        config = GPT2Config()
    else:
        config = GPT2Config.from_json_file(gpt2_config_file)
    model = GPT2Model(config)

    # Load weights from numpy
    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
    torch.save(model.state_dict(), pytorch_weights_dump_path)
    print("Save configuration file to {}".format(pytorch_config_dump_path))
    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
        f.write(config.to_json_string())
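A hypothetical invocation of the converter defined above; the paths here are placeholders, not values from the original snippet.

convert_gpt2_checkpoint_to_pytorch(
    gpt2_checkpoint_path="models/117M/model.ckpt",  # TensorFlow checkpoint (placeholder path)
    gpt2_config_file="",                            # empty string falls back to the default GPT2Config()
    pytorch_dump_folder_path="./gpt2-pytorch",      # output directory (placeholder path)
)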
Example #13
import torch
import torch.nn as nn
from pytorch_transformers import GPT2Model, GPT2LMHeadModel, GPT2Tokenizer
from torchbench.language_modelling import WikiText103

new_model = GPT2Model.from_pretrained('gpt2-large')
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-large')


def model_output_transform(output, target, model):
    n_embd = 1280
    vocab_size = 50257
    lm_head = nn.Linear(n_embd, vocab_size, bias=False).cuda()
    if model.config.torchscript:
        lm_head.weight = nn.Parameter(model.wte.weight.clone())
    else:
        lm_head.weight = model.wte.weight
    hidden_states = output[0]
    lm_logits = lm_head(hidden_states)
    return lm_logits


WikiText103.benchmark(
    model=new_model,
    context_length=1024,
    encoder=tokenizer,
    model_output_transform=model_output_transform,
    paper_model_name='GPT-2 Large',
    paper_pwc_id='language-models-are-unsupervised-multitask')
Example #14
def test_gpt2_embeddings():
    gpt_model: str = "gpt2-medium"

    tokenizer = GPT2Tokenizer.from_pretrained(gpt_model)
    model = GPT2Model.from_pretrained(pretrained_model_name_or_path=gpt_model,
                                      output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize("<|endoftext|>" + s + "<|endoftext|>")

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    #         0           1      2       3        4         5       6      7      8       9      10     11     12     13    14          15
    #
    #  '<|endoftext|>', 'Ber', 'lin', 'Ġand', 'ĠMunich', 'Ġhave', 'Ġa', 'Ġlot', 'Ġof', 'Ġpupp', 'ete', 'er', 'Ġto', 'Ġsee', 'Ġ.', '<|endoftext|>'
    #                      \     /       |        |         |       |      |      |         \      |      /     |      |      |
    #                       Berlin      and    Munich     have      a     lot     of           puppeteer        to    see     .
    #
    #                         0          1        2         3       4      5       6               7             8     9      10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = OpenAIGPT2Embeddings(
            pretrained_model_name_or_path=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)

        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation="first")

    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation="last")

    # The first token is split into two subwords.
    # Since we use "last" as the pooling operation, the last subword stands in for the first token here.
    first_token_embedding_ref = first_layer[2].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[11].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[9], first_layer[11]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding(
        [first_layer[1], first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[9], first_layer[10], first_layer[11]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * 1024
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 1024
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size
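The calculate_mean_embedding helper used in the test above is not included in this excerpt; a minimal sketch, assuming it simply averages the stacked subword vectors:

import torch
from typing import List

def calculate_mean_embedding(subword_embeddings: List[torch.Tensor]) -> torch.Tensor:
    # Average all subword vectors into one token-level embedding.
    return torch.mean(torch.stack(subword_embeddings), dim=0)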
Example #15
import csv
from random import shuffle

import numpy as np

from sklearn.linear_model import LogisticRegressionCV, RidgeClassifierCV
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import VectorizerMixin

import torch
from pytorch_transformers import GPT2Tokenizer, GPT2Model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2Model.from_pretrained('gpt2-medium')

import eli5
from eli5.lime import TextExplainer

positives = []
negatives = []
rowcutoff = 10000

with open('bset_automl_2.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    index = -1
    for line in csv_reader:
        # skipping header row
        index += 1
        if index > 0:
Example #16
# Load pre-trained model tokenizer (vocabulary)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# Encode some inputs
text_1 = "Who was Jim Henson ?"
text_2 = "Jim Henson was a puppeteer"
indexed_tokens_1 = tokenizer.encode(text_1)
indexed_tokens_2 = tokenizer.encode(text_2)

# Convert inputs to PyTorch tensors
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])

# Load pre-trained model (weights)
model = GPT2Model.from_pretrained('gpt2')
model.eval()

# If you have a GPU, put everything on cuda
tokens_tensor_1 = tokens_tensor_1.to('cuda')
tokens_tensor_2 = tokens_tensor_2.to('cuda')
model.to('cuda')

# Predict hidden states features for each layer
with torch.no_grad():
    hidden_states_1, past = model(tokens_tensor_1)
    print(len(hidden_states_1))
    print(hidden_states_1[-1].size())
    # past can be used to reuse precomputed hidden states in subsequent predictions
    # (see beam-search examples in the run_gpt2.py example).
    # hidden_states_2, past = model(tokens_tensor_2, past=past)
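Following the comment above, a short continuation sketch that reuses the cached `past` so the second input attends to the first input's precomputed states (same pytorch_transformers API as the snippet):

with torch.no_grad():
    hidden_states_2, past = model(tokens_tensor_2, past=past)
    print(hidden_states_2.size())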
Example #17
print(tokenized_text_1)
print(indexed_tokens1)
print(tokens_tensor_1)

print(tokenized_text_2)
print(indexed_tokens2)
print(tokens_tensor_2)
"""
print("Encode:")
text = "What is the fastest car in the "
indexed_tokens = tokenizer.encode(text)
# Convert indexed tokens in a PyTorch tensor
tokens_tensor = torch.tensor([indexed_tokens])
print(indexed_tokens)
"""
encoder = GPT2Model.from_pretrained('gpt2')
with torch.no_grad():
    last_hidden_states_1, past = encoder(tokens_tensor_1)
print(last_hidden_states_1.size())

with torch.no_grad():
    last_hidden_states_2, past = encoder(tokens_tensor_2)
print(last_hidden_states_2.size())

# Load pre-trained model (weights)
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
# Set the model in evaluation mode to deactivate the DropOut modules
model.eval()

# Predict all tokens
Example #18
    def test_model_from_pretrained(self):
        cache_dir = "/tmp/pytorch_transformers_test/"
        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
            model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir)
            shutil.rmtree(cache_dir)
            self.assertIsNotNone(model)
Example #19
import torch
from pytorch_transformers import GPT2Model
import numpy as np

sequence_length = 3

input_sequence = torch.tensor(np.zeros(sequence_length),
                              dtype=torch.long).unsqueeze(0)

GPT2Model.from_pretrained('gpt2')(input_sequence)
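The call above discards the outputs; in pytorch_transformers the forward returns a tuple whose first element is the final hidden states. A small sketch that captures and inspects them:

model = GPT2Model.from_pretrained('gpt2')
outputs = model(input_sequence)
print(outputs[0].shape)  # torch.Size([1, 3, 768]) for the base 'gpt2' model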
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, help="pretrained_model.")
    parser.add_argument("--model_option", type=str, default='gpt-2-2', help="pretrained_model.")
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--do_probe", action='store_true', help="Whether to run probing.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--data_dir', type=str, default='/home/xiongyi/dataxyz/repos/SemSynLSTM/word_language_model/data/wikitext-2/')
    parser.add_argument('--seed', type=int, default=12)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    timenow = datetime.datetime.now().strftime("%b%d%H%M")
    model_option = 'gpt_2_2'
    outdir = model_option + timenow
    args = parser.parse_args(['--output_dir', outdir,'--do_probe','--num_train_epochs', '10', '--model_option',model_option])
    #args = parser.parse_args(['--output_dir', './tmp', '--do_eval', '--model_name', 'gpt2'])
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))


    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    #special_tokens = ['_start_', '_delimiter_']
    #special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)

    # Compute the max input length for the Transformer
    input_length = 128
    data_dir = '../SemSynLSTM/word_language_model/data/wikitext-2/' if args.data_dir is None else args.data_dir
    train_set, val_set, test_set, dictionary, pos_dictionary = load_tokenize_and_batchify(data_dir, input_length)

    # Prepare inputs tensors and dataloaders

    train_data = TensorDataset(*train_set)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=32)

    eval_data = TensorDataset(*val_set)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=32)

    config = GPT2Config(n_positions=256,n_ctx=256, n_layer=8,n_head=8, n_embd= 256)
    config.vocab_size = dictionary.__len__()
    config.pos_vocab_size = pos_dictionary.__len__()
    config.n_ctx = input_length
    config.n_positions = input_length
    model1 = GPT2Model(config=config)

    #TODO: GPTWithPOSPredicting
    model2 = GPT2Model(config=config)

    #TODO: Wrap the 2 transformers together and add an LM head
    model = WrapperLMHead(model1, model2, config, args.model_option)
    model.to(device)

    # TODO: Load and encode the datasets

    logger.info("Encoding dataset...")

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          #max_grad_norm=args.max_grad_norm,
                          weight_decay=args.weight_decay)
                          #t_total=num_train_optimization_steps)

    if args.do_train:
        train_results = {}
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            ###eval on eval set
            model.eval()
            nb_eval_steps, nb_eval_examples = 0, 0
            log_probs_sum = 0
            perp = 0.0
            average_loss = np.array([0.0,0.0,0.0])
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch

                with torch.no_grad():
                    loss, loss_lm, loss_pos = model(input_ids, pos_ids = input_pos_ids, labels=input_ids)[0]
                    loss = loss.detach().cpu().numpy()
                    loss_lm = loss_lm.detach().cpu().numpy()
                    loss_pos = loss_pos.detach().cpu().numpy()
                    perp_batch = np.exp(loss_lm)
                    perp += perp_batch
                    average_loss += np.array([loss, loss_lm, loss_pos])
                nb_eval_steps += 1
            perp /= nb_eval_steps
            average_loss /= nb_eval_steps
            print('loss, loss_lm, loss_pos', average_loss,'perp ', perp, 'epoch ', epoch)
            train_results[epoch]= (perp, average_loss)

            model.train()

            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch
                loss = model(input_ids, labels=input_ids, pos_ids = input_pos_ids)[0][0]
                #breakpoint()
                #loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} ".format(exp_average_loss)

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        #tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = GPT2LMHeadModel.from_pretrained(args.output_dir)
        #tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)
        print (train_results)
    if args.do_eval:
        model.eval()
        nb_eval_steps, nb_eval_examples = 0, 0
        log_probs_sum=0
        perp = 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_pos_ids = batch

            with torch.no_grad():
                loss = model(input_ids, labels= input_ids, pos_ids=input_pos_ids)[0].detach().cpu().numpy()
                perp_batch = np.exp(loss)
                perp += perp_batch
            nb_eval_steps += 1

        perp /= nb_eval_steps
        # perp_word = perp / 128
        print (perp)
        result = {'eval_perp': perp}
        logger.info("***** Eval results *****")
        logger.info("'eval_perp' = %s", str(result['eval_perp']))
        # output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        # with open(output_eval_file, "w") as writer:
        #     logger.info("***** Eval results *****")
        #     for key in sorted(result.keys()):
        #         logger.info("  %s = %s", key, str(result[key]))
        #         writer.write("%s = %s\n" % (key, str(result[key])))

    if args.do_probe:

        ##load model (how???)
        model_path = '/home/xiongyi/dataxyz/repos/pytorch-pretrained-BERT/examples/gpt2_2_jul22/pytorch_model.bin_double'
        model.load_state_dict(torch.load(model_path))
        ##Add a mlp to the representation

        probe_model = ProbeModel(model, config)
        probe_model.to(device)
        ##train and eval
        all_param = list(probe_model.named_parameters())
        param_probe = [(n, p) for n, p in all_param if 'probe_cls' in n]
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_probe if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_probe if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate,
                          # max_grad_norm=args.max_grad_norm,
                          weight_decay=args.weight_decay)
        # t_total=num_train_optimization_steps)
        train_results = {}
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            ###eval on eval set
            probe_model.eval()
            nb_eval_steps, nb_eval_examples = 0, 0
            average_loss = 0
            average_acc = 0
            for batch in tqdm(eval_dataloader, desc="Evaluating"):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch

                with torch.no_grad():
                    #breakpoint()
                    loss = probe_model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0].detach().cpu().numpy()
                    pos_logits = probe_model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[1].detach().cpu().numpy()
                    predicted_labels = np.argmax(pos_logits, -1)
                    correct_rate = np.mean(predicted_labels == input_pos_ids.detach().cpu().numpy()[:,1:])
                    average_acc += correct_rate
                    average_loss += loss
                nb_eval_steps += 1
            average_loss /= nb_eval_steps
            average_acc /= nb_eval_steps
            print('loss', average_loss,' acc_rate ', average_acc, ' epoch ', epoch)
            train_results[epoch] = (average_loss, average_acc)

            probe_model.train()

            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_pos_ids = batch
                loss = probe_model(input_ids, labels=input_ids, pos_ids=input_pos_ids)[0]
                # breakpoint()
                # loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e}".format(exp_average_loss)
Example #21
from pytorch_transformers import GPT2Model, GPT2Tokenizer
import numpy as np
import torch
import math
import torch.nn as nn

sequence_length = 3

model = GPT2Model.from_pretrained("gpt2")

input_ids = torch.tensor(np.zeros(sequence_length), dtype=torch.long)
position_ids = torch.tensor(np.arange(sequence_length), dtype=torch.long)

# Output of the embeddings addition
embeddings = model.wpe(position_ids) + model.wte(input_ids)

# Output of the first Attention LayerNorm layer
ln_1 = model.h[0].ln_1(embeddings)

# Output of the attention dense layer for Q, K, V
c_attn = model.h[0].attn.c_attn(ln_1).reshape((-1, sequence_length, 2304))

# Splitting the QKV vector
query, key, value = c_attn.split(model.h[0].attn.split_size, dim=2)

# Splitting the heads
split_query, split_key, split_value = model.h[0].attn.split_heads(query), model.h[0].attn.split_heads(key, k=True), model.h[0].attn.split_heads(value)

# QK Matmul
w = torch.matmul(split_query, split_key)
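Continuing the walkthrough (this part is a sketch, not from the original snippet): scale, softmax, and apply the attention weights to the value heads. The causal mask that GPT-2 normally applies before the softmax is omitted here for brevity.

w = w / math.sqrt(split_value.size(-1))     # scale by sqrt(head dimension)
w = nn.functional.softmax(w, dim=-1)        # attention weights (causal mask omitted)
attn_output = torch.matmul(w, split_value)  # weighted sum of value heads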
Example #22
def test_gpt2_embeddings():
    gpt_model = 'gpt2-medium'
    tokenizer = GPT2Tokenizer.from_pretrained(gpt_model)
    model = GPT2Model.from_pretrained(pretrained_model_name_or_path=gpt_model,
                                      output_hidden_states=True)
    model.to(flair.device)
    model.eval()
    s = 'Berlin and Munich have a lot of puppeteer to see .'
    with torch.no_grad():
        tokens = tokenizer.tokenize('<|endoftext|>' + s + '<|endoftext|>')
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)
        hidden_states = model(tokens_tensor)[-1]
        first_layer = hidden_states[1][0]
    assert (len(first_layer) == len(tokens))

    def embed_sentence(sentence: str,
                       pooling_operation,
                       layers: str = '1',
                       use_scalar_mix: bool = False) -> Sentence:
        embeddings = OpenAIGPT2Embeddings(
            pretrained_model_name_or_path=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix)
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    sentence_first_subword = embed_sentence(sentence=s,
                                            pooling_operation='first')
    first_token_embedding_ref = first_layer[1].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_first_subword_embedding_actual = sentence_first_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_subword_embedding_ref ==
            puppeteer_first_subword_embedding_actual)
    sentence_last_subword = embed_sentence(sentence=s,
                                           pooling_operation='last')
    first_token_embedding_ref = first_layer[2].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_last_subword_embedding_ref = first_layer[11].tolist()
    puppeteer_last_subword_embedding_actual = sentence_last_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_last_subword_embedding_ref ==
            puppeteer_last_subword_embedding_actual)
    sentence_first_last_subword = embed_sentence(
        sentence=s, pooling_operation='first_last')
    first_token_embedding_ref = torch.cat([first_layer[1],
                                           first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[
        0].embedding.tolist()
    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[9], first_layer[11]]).tolist()
    puppeteer_first_last_subword_embedding_actual = sentence_first_last_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_first_last_subword_embedding_ref ==
            puppeteer_first_last_subword_embedding_actual)
    sentence_mean_subword = embed_sentence(sentence=s,
                                           pooling_operation='mean')
    first_token_embedding_ref = calculate_mean_embedding(
        [first_layer[1], first_layer[2]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[
        0].embedding.tolist()
    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[9], first_layer[10], first_layer[11]]).tolist()
    puppeteer_mean_subword_embedding_actual = sentence_mean_subword.tokens[
        7].embedding.tolist()
    assert (first_token_embedding_ref == first_token_embedding_actual)
    assert (puppeteer_mean_subword_embedding_ref ==
            puppeteer_mean_subword_embedding_actual)
    sentence_mult_layers = embed_sentence(sentence='Munich',
                                          pooling_operation='first',
                                          layers='1,2,3,4')
    ref_embedding_size = (4 * 1024)
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)
    sentence_mult_layers_scalar_mix = embed_sentence(sentence='Berlin',
                                                     pooling_operation='first',
                                                     layers='1,2,3,4',
                                                     use_scalar_mix=True)
    ref_embedding_size = (1 * 1024)
    actual_embedding_size = len(
        sentence_mult_layers_scalar_mix.tokens[0].embedding)
    assert (ref_embedding_size == actual_embedding_size)