Example 1
        def create_and_check_lm_head_model(self, config, input_ids, head_mask,
                                           token_type_ids, *args):
            model = OpenAIGPTLMHeadModel(config)
            model.eval()

            loss, lm_logits = model(input_ids,
                                    token_type_ids=token_type_ids,
                                    labels=input_ids)

            result = {"loss": loss, "lm_logits": lm_logits}

            self.parent.assertListEqual(list(result["loss"].size()), [])
            self.parent.assertListEqual(
                list(result["lm_logits"].size()),
                [self.batch_size, self.seq_length, self.vocab_size])
Example 2
    def test_lm_generate_openai_gpt(self):
        model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
        input_ids = torch.tensor([[481, 2585, 544, 4957]], dtype=torch.long)  # The dog is cute
        expected_output_ids = [
            481, 2585, 544, 4957, 669, 512, 761, 5990, 271, 645,
            487, 535, 976, 2479, 240, 487, 804, 1296, 2891, 512,
        ]  # the dog is cute when you're annoyed : if he's really stupid, he 'll stop fighting you
        torch.manual_seed(0)

        output_ids = model.generate(input_ids)
        self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
Example 3
def reset_bot():
    global history, tokenizer, model, personality
    dataset_path = './chatapp/data/counsel_chat_250-tokens_full.json'
    dataset_cache = './chatapp/dataset_cache'
    model_checkpoint = download_pretrained_model()
    device = "cpu"
    seed = random.randrange(0, 100)
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Get pretrained model and tokenizer
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
    model.to(device)
    add_special_tokens_(model, tokenizer)

    # Sample a personality
    dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
    personalities = [
        dialog["personality"] for dataset in dataset.values()
        for dialog in dataset
    ]
    personality = random.choice(personalities)

    history = []
    return ""
    def test_lm_generate_openai_gpt(self):
        model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
        model.to(torch_device)
        input_ids = torch.tensor([[481, 4735, 544]], dtype=torch.long, device=torch_device)  # the president is
        expected_output_ids = [
            481, 4735, 544, 246, 963, 870, 762, 239, 244, 40477,
            244, 249, 719, 881, 487, 544, 240, 244, 603, 481,
        ]  # the president is a very good man. " \n " i'm sure he is, " said the

        output_ids = model.generate(input_ids, do_sample=False)
        self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
Example 5
    def __init__(self, bot):
        self.bot = bot
        self.src_dir = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..'))

        model_path = os.path.join(self.src_dir, "conv_ai/model/")
        self.args = {
            "max_history": 2,
            "device": "cpu",
            "max_length": 20,
            "min_length": 1,
            "temperature": 0.7,
            "top_k": 0,
            "top_p": 0.9,
            "no_sample": 1
        }
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(model_path)
        self.model = OpenAIGPTLMHeadModel.from_pretrained(model_path)
        self.model.to('cpu')
        add_special_tokens_(self.model, self.tokenizer)
        dataset = get_dataset(
            self.tokenizer, "",
            os.path.join(self.src_dir, "conv_ai/dataset_cache"))

        self.personalities = [
            dialog["personality"] for dataset in dataset.values()
            for dialog in dataset
        ]
        self.personality = random.choice(self.personalities)

        self.history = []
        print("Conversational AI model loaded successfully.")
    def create_and_check_lm_head_model(self, config, input_ids, head_mask, token_type_ids, *args):
        model = OpenAIGPTLMHeadModel(config)
        model.to(torch_device)
        model.eval()

        result = model(input_ids, token_type_ids=token_type_ids, labels=input_ids)
        self.parent.assertEqual(result.loss.shape, ())
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
Example 7
def get_gpt2_perplexity(sentence):
    global model, tokenizer
    import torch
    if model is None:
        # note: loads openai-gpt despite the function's name
        from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        model.eval()
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor(
        [tokenizer.convert_tokens_to_ids(tokenize_input)])
    # with labels supplied, the model returns (loss, logits)
    outputs = model(tensor_input, labels=tensor_input)
    return math.exp(outputs[0].item())
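A quick sanity check of the helper above (assumes the module sets model = None at import time, as the global declaration implies, and imports math):

model = None  # module-level cache read by get_gpt2_perplexity
print(get_gpt2_perplexity("the dog is cute"))   # fluent text: lower perplexity
print(get_gpt2_perplexity("cute the is dog"))   # scrambled text: higher perplexity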
Example 8
    def __init__(self):
        self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        self.gpt = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').cuda()
        self.embedder = SentenceTransformer('bert-base-nli-mean-tokens').cuda()
        self.pos_phrase = "I have an undiagnosed disease. "

        self.keywords = [term.strip().lower() for term in open('tweet_crawler/terms.txt').read().split('\n')
                         if term != "" and term != "undiagnosed" and term != "disease"]

        self.udn_examples = list(open('data/UDN_patient_search_TWEET_samples.txt').read().split('\n')) + \
                            list(open('data/UDN_patient_search_WEB_samples.txt').read().split('\n'))

        # self.phrase_gpt_score = gpt_log_prob_score([self.phrase], self.gpt, self.tokenizer)
        self.pos_phrase_emb = self.embedder.encode([self.pos_phrase])[0]
Example 9
    triggers = {row[0]: row[1] for row in csv.reader(triggers_file)}

max_history = 2
min_length, max_length = 1, 20
dataset_path = './chatapp/data/counsel_chat_250-tokens_full.json'
dataset_cache = './chatapp/dataset_cache'
model_checkpoint = download_pretrained_model()
device = "cpu"
seed = 0
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Get pretrained model and tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
model.to(device)
add_special_tokens_(model, tokenizer)

# Sample a personality
dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
personalities = [
    dialog["personality"] for dataset in dataset.values() for dialog in dataset
]
personality = random.choice(personalities)

history = []


@app.route("/")
def home():
Example 10
				with open(cached_input_file, "rb") as reader:
					eval_inputs = pickle.load(reader)
					reader.close()
			except:
				eval_inputs = read_data(args.data_dir + 'dev.txt', length)
				if args.local_rank == -1:
					logger.info("  Saving eval features into cached file %s", cached_input_file)
					with open(cached_input_file, "wb") as writer:
						pickle.dump(eval_inputs, writer)
						writer.close()

			eval_dataloader = DataLoader(eval_inputs, sampler=SequentialSampler(eval_inputs),
			                             batch_size=args.eval_batch_size)

			# Set model
			model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
			model.to(device)

			# Train model
			logger.info("***** Run training and evaluating *****")
			logger.info("  Num of train examples = %d", len(train_dataloader))
			logger.info("  Train batch size = %d", args.train_batch_size)
			logger.info("  Num of eval examples = %d", len(eval_dataloader))
			logger.info("  Eval batch size = %d", args.eval_batch_size)
			model.train()

			num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs // args.train_batch_size

			# Prepare optimizer and schedule (linear warmup and decay)
			no_decay = ["bias", "LayerNorm.weight"]
			optimizer_grouped_parameters = [
Example 11
import math
import time
import json
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
#!pip install transformers
#!pip install ftfy
#!pip install spacy
from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer, AdamW
%load_ext tensorboard
from torch.utils.tensorboard import SummaryWriter

writer = SummaryWriter('runs/BaselineModel')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').to(device)
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

def add_special_tokens_(model, tokenizer):
    """ Add special tokens to the tokenizer and the model if they have not already been added. """
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN) # doesn't add if they are already there
    if num_added_tokens > 0:
        model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)

SPECIAL_TOKENS = ["<bos>", "<eos>", "<system>", "<user>", "<slots>", "<pad>"]
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<system>', '<user>', '<slots>']}
MODEL_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]
PADDED_INPUTS = ["input_ids", "lm_labels", "token_type_ids"]
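Because add_special_tokens_ only resizes the embedding matrix when the tokenizer actually gained tokens, it is safe to call more than once, but it must run before any of the SPECIAL_TOKENS are encoded. A minimal sketch of the expected post-condition (the asserts are illustrative, not from the original):

add_special_tokens_(model, tokenizer)
assert tokenizer.pad_token == '<pad>'
assert model.get_input_embeddings().num_embeddings == len(tokenizer.encoder) + len(SPECIAL_TOKENS)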
Example 12
        Input = Input.long().to(self.device)
        Output = self.Trans(Input, attention_mask=attn_mask)
        logits = Output[0]
        labels = Input
        # shift so that tokens < n predict token n, then keep only positions allowed by loss_mask
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        flatten_shift_loss_mask = loss_mask[..., :-1].contiguous().view(-1)
        ids = nonzero(flatten_shift_loss_mask).view(-1)
        fin_logits, fin_labels = shift_logits.view(-1, shift_logits.size(-1))[ids], shift_labels.view(-1)[ids]
        return fin_logits, fin_labels
    def decode(self, Input, Label, max_length):
        """Decode the given input ids up to max_length tokens."""
        attn_mask = tensor(Label.clone().detach() == 1.0, dtype=uint8, device=self.device)
        Input = Input.long().to(self.device)
        Output = self.Trans.generate(Input, attention_mask=attn_mask, max_length=max_length)
        return Output
      

if __name__ == "__main__":
    Trans_Config = OpenAIGPTConfig(vocab_size=3002, n_layer=12)
    Trans_Model = OpenAIGPTLMHeadModel(Trans_Config)
    Token_Dir = r"G:\Work Related\Nlc2cmd\Tokenizer_Train\GPTToken/"
    Trans_Tok = GPT2TokenizerFast.from_pretrained(Token_Dir)
    Omni = OmniBash(Trans_Model, "cpu")
    Dataset = OmnibashDataset(r"G:\Work Related\Nlc2cmd\Data\Template.json", Trans_Tok, "train", 100)
    TrainLoader = DataLoader(Dataset, batch_size=10)
    Sample = next(iter(TrainLoader))
    # decode() requires max_length; 100 here matches the sequence length passed to the dataset above (an assumption)
    X = Omni.decode(Sample[0][0].unsqueeze(0), Sample[1][0].unsqueeze(0), max_length=100)
    print(X)
    Out = Trans_Tok.convert_ids_to_tokens(X[0])
    print(Out)
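The shift-and-mask logic in the forward method above pairs the logits at each position with the token that follows it and keeps only the positions selected by loss_mask. A toy shape check (values are illustrative; the vocab size of 3002 matches the config above):

import torch

logits = torch.randn(1, 5, 3002)              # (batch, seq_len, vocab)
labels = torch.randint(0, 3002, (1, 5))
loss_mask = torch.tensor([[1, 1, 1, 0, 0]])
shift_logits = logits[..., :-1, :]            # predictions for targets 1..4
shift_labels = labels[..., 1:]                # targets shifted left by one
ids = loss_mask[..., :-1].contiguous().view(-1).nonzero().view(-1)
print(shift_logits.reshape(-1, 3002)[ids].shape)  # torch.Size([3, 3002])
print(shift_labels.reshape(-1)[ids].shape)        # torch.Size([3])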
Example 13
from .helpers import *
from .models import ElmoSCLSTM
from .util import get_module_or_attr

""" NEW: reranking snippets """
# (GPT/GPT-2/CTRL/Transformer-XL/XLNet)
import torch
from torch.nn import CrossEntropyLoss

HFACE_batch_size = 8
RERANKER = "GPT-2"  # GPT/GPT-2/CTRL/Transformer-XL/XLNet
if RERANKER == "GPT":
    from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

    gpt2Tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    gpt2LMHeadModel = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    gpt2Tokenizer.add_special_tokens({'pad_token': "[PAD]"})
    gpt2LMHeadModel.resize_token_embeddings(len(gpt2Tokenizer))
    assert gpt2Tokenizer.pad_token == '[PAD]'
elif "GPT-2":
    from transformers import GPT2Tokenizer, GPT2LMHeadModel

    gpt2Tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
    gpt2LMHeadModel = GPT2LMHeadModel.from_pretrained('gpt2-medium')
    gpt2Tokenizer.pad_token = gpt2Tokenizer.eos_token
elif "Transformer-XL":
    from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel

    txlTokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
    txlLMHeadModel = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
    txlTokenizer.pad_token = txlTokenizer.eos_token
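Only the model/tokenizer setup is shown here; the reranking routine lives elsewhere in the source file. As a hedged sketch of how such a pair can score candidate sentences (the function name, callable-tokenizer API, and mean-NLL reduction are assumptions, not the repository's implementation):

def lm_scores(sentences, tokenizer=gpt2Tokenizer, model=gpt2LMHeadModel):
    # hypothetical helper: mean negative log-likelihood per sentence (lower = more fluent)
    batch = tokenizer(sentences, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = model(**batch)[0]                                 # (B, T, V)
    shift_logits = logits[:, :-1, :]
    shift_labels = batch["input_ids"][:, 1:]
    loss_fct = CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, reduction="none")
    nll = loss_fct(shift_logits.transpose(1, 2), shift_labels)     # (B, T-1)
    mask = (shift_labels != tokenizer.pad_token_id).float()
    return (nll * mask).sum(1) / mask.sum(1)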
Example 14
from typing import Tuple

from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer


def load_model(name: str) -> Tuple[OpenAIGPTLMHeadModel, OpenAIGPTTokenizer]:
    model = OpenAIGPTLMHeadModel.from_pretrained(name)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(name)
    model.eval()
    return model, tokenizer
Example 15
def setup_gpt(model_name="openai-gpt"):
    model = OpenAIGPTLMHeadModel.from_pretrained(model_name)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name)
    return model, tokenizer
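Both helpers return the same (model, tokenizer) pair; minimal greedy-decoding usage (the prompt string is illustrative):

model, tokenizer = setup_gpt()
input_ids = tokenizer.encode("the dog is", return_tensors="pt")
output_ids = model.generate(input_ids, do_sample=False)
print(tokenizer.decode(output_ids[0]))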