def __init__(self, model, num_steps, num_classes=2):
    super(GPT2Classifier, self).__init__()
    self.tokenizer = GPT2Tokenizer.from_pretrained(model)
    self.tokenizer.pad_token = self.tokenizer.eos_token
    self.encoder = GPT2ForSequenceClassification.from_pretrained(model)
    self.encoder.config.pad_token_id = self.tokenizer.eos_token_id
    self.num_steps = num_steps
    def makeUnilabelModel(self, modelName, num_labels=10, root='', **kwargs):
        if modelName == 'distilbert-base-uncased':
            tokenizer = DistilBertTokenizerFast.from_pretrained(
                'distilbert-base-uncased')
            model = DistilBertForSequenceClassification.from_pretrained(
                root + "distilbert-base-uncased",
                num_labels=num_labels,
                **kwargs)
        if modelName == 'gpt2':
            tokenizer = GPT2TokenizerFast.from_pretrained('gpt2')
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model = GPT2ForSequenceClassification.from_pretrained(
                root + "gpt2", num_labels=num_labels, **kwargs)
            model.resize_token_embeddings(len(tokenizer))
            # add padding token
            model.config.pad_token_id = tokenizer('[PAD]').input_ids[0]
        if modelName == 'bertweet':
            tokenizer = AutoTokenizer.from_pretrained('vinai/bertweet-base')
            model = AutoModelForSequenceClassification.from_pretrained(
                root + "vinai/bertweet-base", num_labels=num_labels, **kwargs)
        if modelName == 'distilroberta-base':
            tokenizer = AutoTokenizer.from_pretrained('distilroberta-base')
            model = AutoModelForSequenceClassification.from_pretrained(
                root + "distilroberta-base", num_labels=num_labels, **kwargs)
        if modelName == 'lstm':
            tokenizer = AutoTokenizer.from_pretrained(
                'distilbert-base-uncased')
            model = LSTMCclassifier(128, 64, 2, tokenizer.vocab_size,
                                    num_labels)

        return tokenizer, model
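A hedged usage sketch for the factory above; the `builder` instance and the label count are hypothetical, and the checkpoints are assumed to be resolvable locally or from the Hugging Face Hub:

# Hypothetical call site: `builder` is assumed to be an instance of the class
# that defines makeUnilabelModel.
tokenizer, model = builder.makeUnilabelModel('gpt2', num_labels=4)

# GPT-2 has no padding token by default; the factory adds '[PAD]' and resizes
# the embeddings, so batched encoding works as usual.
batch = tokenizer(["first text", "a longer second text"],
                  padding=True, truncation=True, return_tensors="pt")
logits = model(**batch).logits        # shape: (2, num_labels)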
Example n. 3
def __init__(self, conf):
    super(GPT2Classifier, self).__init__()
    self.conf = conf
    self.padding = 0
    self.gpt_path = getattr(conf, 'bert_path', None)
    pretrain_name = 'gpt2'
    if self.gpt_path:
        pretrain_name = self.gpt_path
    print('GPT Model from {}'.format(pretrain_name))
    self.gpt = GPT2ForSequenceClassification.from_pretrained(
        pretrain_name, num_labels=conf.class_num)
    self.gpt.config.pad_token_id = self.gpt.config.eos_token_id
    self.drop = nn.Dropout(p=0.3)
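The snippet shows only the constructor. Below is a minimal sketch of what a matching forward pass could look like; this is an assumption, not the author's code, and it does not exercise self.drop, since GPT2ForSequenceClassification applies its own classification head internally:

def forward(self, input_ids, attention_mask=None, labels=None):
    # Delegate to GPT2ForSequenceClassification, which pools on the last
    # non-pad token and also returns a loss when labels are provided.
    outputs = self.gpt(input_ids=input_ids,
                       attention_mask=attention_mask,
                       labels=labels)
    return outputs.loss, outputs.logits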
Example n. 4
    def __init__(self):

        # Look for gpu to use. Will use `cpu` by default if no gpu found.
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')
        # device = torch.device('cpu')
        print("Device: ", self.device)
        _ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        model_path = os.path.join(_ROOT, "Models", "GPT2_Model", "model")
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            pretrained_model_name_or_path=model_path)
        self.gpt_model = GPT2ForSequenceClassification.from_pretrained(
            pretrained_model_name_or_path=model_path)
        self.gpt_model.eval()
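A hedged sketch of how this wrapper might be used for a single prediction; the predict method below is illustrative and not part of the original class:

    def predict(self, text):
        # Tokenize one string and move the tensors to the chosen device.
        inputs = self.tokenizer(text, return_tensors="pt",
                                truncation=True).to(self.device)
        self.gpt_model.to(self.device)
        with torch.no_grad():                       # inference only
            logits = self.gpt_model(**inputs).logits
        return int(logits.argmax(dim=-1))           # predicted class index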
Example n. 5
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
import torch


tokenizer = GPT2Tokenizer.from_pretrained('microsoft/dialogrpt')
model = GPT2ForSequenceClassification.from_pretrained('microsoft/dialogrpt')
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
outputs = model(**inputs, labels=labels)
loss = outputs.loss
logits = outputs.logits
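To turn the logits into an actual prediction, a softmax/argmax step can be appended (an illustrative continuation, not part of the original snippet):

import torch.nn.functional as F

probs = F.softmax(logits, dim=-1)            # class probabilities, shape (1, num_labels)
predicted_class = int(probs.argmax(dim=-1))  # index of the most likely class
print(predicted_class, probs.tolist())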
Example n. 6
        validation_dataloader = torch.load(val_loader_fn)
        train_dataloader = torch.load(train_loader_fn)
    else:
        validation_dataloader, train_dataloader = create_train_val_loaders(
            config['dataset']['train'], val_loader_fn, train_loader_fn,
            tokenizer, batch_size, max_length)

    if not os.path.exists(test_loader_fn) or build_new_dataloaders:
        create_test_loader(config['dataset']['test'], test_loader_fn,
                           tokenizer, batch_size, max_length)

    # Import the pretrained model.
    if model_name == "gpt2":
        model = GPT2ForSequenceClassification.from_pretrained(
            "gpt2", num_labels=NUM_LABELS)
    else:
        model = AutoModelForSequenceClassification.from_pretrained(
            config['model'][model_name], num_labels=NUM_LABELS)

    if cpu:
        # Distributor = torch.nn.parallel.DistributedDataParallelCPU
        # import torch.distributed as dist
        # rank=1
        # world_size=12
        # dist.init_process_group("gloo", world_size=world_size,rank=-1, store= None)
        # parallel_model = Distributor(model)
        parallel_model = model
    else:
        parallel_model = torch.nn.DataParallel(model)  # Encapsulate the model
        parallel_model.cuda()
Example n. 7

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
    val_dataset,  # The validation samples.
    sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
    batch_size=batch_size  # Evaluate with this batch size.
)

# Load GPT2ForSequenceClassification, the pretrained GPT2 model with a single
# linear classification layer on top.
model_config = GPT2Config.from_pretrained('gpt2', num_labels=2)

# Get the actual model.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained('gpt2',
                                                      config=model_config)

# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

# Run this model on GPU.
model.cuda()

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # args.learning_rate - default is 5e-5; the paper recommends 5e-5, 3e-5, or 2e-5
    eps=1e-8   # args.adam_epsilon - default is 1e-8
)
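The snippet stops at the optimizer. A minimal sketch of the scheduler and training step that typically follow; the epoch count, the (input_ids, attention_mask, labels) batch layout, gradient clipping, and a train_dataloader built like the validation loader above are assumptions, not the original continuation:

import torch
from transformers import get_linear_schedule_with_warmup

epochs = 4  # assumed value
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epochs)

model.train()
for batch in train_dataloader:
    optimizer.zero_grad()
    # Assumed batch layout: (input_ids, attention_mask, labels).
    input_ids, attention_mask, labels = (t.cuda() for t in batch)
    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
    outputs.loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # common practice
    optimizer.step()
    scheduler.step()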
Example n. 8
import numpy as np
import torch
from torch.utils.data import DataLoader
from transformers import AdamW, BertTokenizer, GPT2Config, GPT2ForSequenceClassification

start_debugger_on_exception()

train_dataset = DataSetBert(data_file='./data/data_train/train.csv')
val_dataset = DataSetBert(data_file='./data/data_train/val.csv')
test_dataset = DataSetBert(data_file='./data/data_train/test.csv')

device = torch.device('cuda:1')
train_dataloader = DataLoader(train_dataset, batch_size=11, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=11, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=11, shuffle=True)

# The uer/gpt2-chinese-cluecorpussmall checkpoint ships a BERT-style tokenizer.
tokenizer = BertTokenizer.from_pretrained('uer/gpt2-chinese-cluecorpussmall')
tokenizer.add_special_tokens({"additional_special_tokens": ["[P]"]})

model_config = GPT2Config.from_pretrained(
    pretrained_model_name_or_path='uer/gpt2-chinese-cluecorpussmall', num_labels=15)
model = GPT2ForSequenceClassification.from_pretrained(
    'uer/gpt2-chinese-cluecorpussmall', config=model_config)
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = model.config.eos_token_id
model.config.n_positions = 1024
assert model.config.num_labels == 15

model.to(device)
model.train()
# import pdb; pdb.set_trace()  # debugging breakpoint (disabled)

optimizer = AdamW(model.parameters(), lr=1e-5)
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
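The grouped parameter lists only take effect if they are passed to the optimizer; a small illustrative continuation that replaces the plain AdamW call above so biases and LayerNorm weights skip weight decay:

optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)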
Example n. 9
# Get model configuration.
print('Loading configuration...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name_or_path, num_labels=n_labels)

# Get model's tokenizer.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name_or_path)
# default to left padding
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token


# Get the actual model.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name_or_path, config=model_config)

# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

# Load model to defined device.
model.to(device)
print('Model loaded to `%s`' % device)

"""# Getting and training model"""

train_dataset = MovieReviewsDataset(use_tokenizer=tokenizer, labels=label_train, texts=text_train)
valid_dataset = MovieReviewsDataset(use_tokenizer=tokenizer, labels=label_test, texts=text_test)
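The two datasets still need batching and tokenization. Below is a hedged collate sketch, assuming each MovieReviewsDataset item is a dict with 'text' and 'label' keys; the real dataset class may expose a different layout or its own collator:

import torch
from torch.utils.data import DataLoader

def gpt2_collate(batch, max_length=128):
    # Left-padded, truncated batch encoding plus a label tensor.
    texts = [item['text'] for item in batch]
    labels = torch.tensor([item['label'] for item in batch])
    enc = tokenizer(texts, padding=True, truncation=True,
                    max_length=max_length, return_tensors='pt')
    enc['labels'] = labels
    return enc

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True,
                              collate_fn=gpt2_collate)
valid_dataloader = DataLoader(valid_dataset, batch_size=32, shuffle=False,
                              collate_fn=gpt2_collate)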
Example n. 10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train',
                        type=str,
                        default='data/disaster_response_messages_training.csv')
    parser.add_argument('--test',
                        type=str,
                        default='data/disaster_response_messages_test.csv')
    parser.add_argument(
        '--validation',
        type=str,
        default='data/disaster_response_messages_validation.csv')
    parser.add_argument('--epoch', type=str, default='10')
    parser.add_argument('--model',
                        type=str,
                        default='bert',
                        choices=['bert', 'bart', 'gpt2', 'roberta', 'xlnet'])
    args = parser.parse_args()

    EPOCH = int(args.epoch)
    model_name = args.model

    # create data loader for training and validation
    if model_name == 'bert':
        train_set = BertDataset(args.train)
        val_set = BertDataset(args.validation)
        test_set = BertDataset(args.test)
    elif model_name == 'bart':
        train_set = BartDataset(args.train)
        val_set = BartDataset(args.validation)
        test_set = BartDataset(args.test)
    elif model_name == 'gpt2':
        train_set = GPT2Dataset(args.train)
        val_set = GPT2Dataset(args.validation)
        test_set = GPT2Dataset(args.test)
    elif model_name == 'roberta':
        train_set = RobertaDataset(args.train)
        val_set = RobertaDataset(args.validation)
        test_set = RobertaDataset(args.test)
    elif model_name == 'xlnet':
        train_set = XLNetDataset(args.train)
        val_set = XLNetDataset(args.validation)
        test_set = XLNetDataset(args.test)

    train_loader = DataLoader(train_set, batch_size=20, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=20, shuffle=False)
    test_loader = DataLoader(test_set, batch_size=20, shuffle=False)

    print('Data Loaded.')

    if model_name == 'bert':
        model = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased', num_labels=2)
    elif model_name == 'gpt2':
        model = GPT2ForSequenceClassification.from_pretrained('gpt2',
                                                              num_labels=2)
        model.config.pad_token_id = model.config.eos_token_id
    elif model_name == 'bart':
        model = BartForSequenceClassification.from_pretrained(
            'facebook/bart-base', num_labels=2)
    elif model_name == 'roberta':
        model = RobertaForSequenceClassification.from_pretrained(
            'roberta-base', num_labels=2)
    elif model_name == 'xlnet':
        model = XLNetForSequenceClassification.from_pretrained(
            'xlnet-base-cased', num_labels=2)

    optimizer = AdamW(model.parameters(), lr=2e-5, correct_bias=False)
    total_steps = len(train_loader) * EPOCH
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
    criterion = nn.CrossEntropyLoss()

    print('\nModel: ', model_name, '\tEpochs: ', EPOCH)

    epoch_loss = []
    epoch_val_acc = []

    for epoch in range(EPOCH):
        tqdm.write('Epoch: {}'.format(epoch + 1))
        loss = train(model, train_loader, criterion, optimizer, scheduler)
        epoch_loss.append(loss)
        val_acc = val(model, val_loader)
        epoch_val_acc.append(val_acc)

    torch.save(model, model_name + '/' + model_name + '_model.pt')

    # model = torch.load(model_name+'_model.pt')

    tqdm.write('\nFinal test...')
    test_result = test(model, test_loader)

    with open(model_name + '/' + model_name + '_loss.p', 'wb') as f:
        pickle.dump(epoch_loss, f)
    with open(model_name + '/' + model_name + '_val_accuracy.p', 'wb') as f:
        pickle.dump(epoch_val_acc, f)
    with open(model_name + '/' + model_name + '_test_result.p', 'wb') as f:
        pickle.dump(test_result, f)
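The train and val helpers are defined elsewhere in the repository. A minimal sketch of a compatible train function, inferred only from how it is called above; the device handling and the (input_ids, attention_mask, labels) batch layout are assumptions:

def train(model, loader, criterion, optimizer, scheduler):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    model.train()
    total_loss = 0.0
    for input_ids, attention_mask, labels in tqdm(loader):
        optimizer.zero_grad()
        logits = model(input_ids.to(device),
                       attention_mask=attention_mask.to(device)).logits
        loss = criterion(logits, labels.to(device))
        loss.backward()
        optimizer.step()
        scheduler.step()
        total_loss += loss.item()
    return total_loss / len(loader)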
Example n. 11
                          labels=targets)
        outputs, loss = outputs["logits"], outputs["loss"]
        test_acc += accuracy_(outputs, targets).item()
print(f"Test Acc: {test_acc / len(test_loader)}")

# as we expected, we got the same result, but training is smoother (4 epochs would have been enough; by epoch 5 it already starts to overfit)

# ## Fine-tuning
#
# Now for a different approach: load a model that was pretrained on the language-modeling task, and see whether we get a quality boost.

# In[7]:

model_1 = GPT2ForSequenceClassification.from_pretrained(
    "distilgpt2",
    output_attentions=True,
    pad_token_id=tokenizer.eos_token_id,
    num_labels=8).to(device)

# In[13]:

from transformers import AdamW, get_linear_schedule_with_warmup
# lr = 1e-5  # suggested learning rate; it could be higher or lower :)

criterion = nn.CrossEntropyLoss()  # strictly speaking redundant: GPT2ForSequenceClassification already computes this loss internally when labels are passed
optimizer = AdamW(model_1.parameters(), lr=2e-5, eps=1e-8)

num_epochs = 10
total_steps = len(train_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
Example n. 12
tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path)

# Creating the tokenizer is pretty standard when using the Transformers library.
# After creating the tokenizer, it is critical for this tutorial to set padding to the left
# (tokenizer.padding_side = "left") and to initialize the padding token to tokenizer.eos_token,
# GPT-2's original end-of-sequence token.
# This is the most essential part of the tutorial: GPT-2 uses the last token for prediction, so we need to pad on the left.
# default to left padding
tokenizer.padding_side = "left"
# Define PAD Token = EOS Token = 50256
tokenizer.pad_token = tokenizer.eos_token

# Get the actual model.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=model_name_or_path,
    config=model_config,
    cache_dir="/home/jovyan/data-vol-1/gpt2/models")

print(
    f"number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}"
)

# resize model embedding to match new tokenizer
model.resize_token_embeddings(len(tokenizer))

# fix model padding token id
model.config.pad_token_id = model.config.eos_token_id

# Load model to defined device.
model.to(device)
print('Model loaded to `%s`' % device)
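With the tokenizer left-padded and the pad token id fixed, batched inference classifies from the right-most (last) token of each sequence; a short illustrative check:

texts = ["a short example", "a considerably longer example that needs no padding"]
enc = tokenizer(texts, padding=True, truncation=True,
                max_length=60, return_tensors='pt').to(device)

model.eval()
with torch.no_grad():
    logits = model(**enc).logits        # shape: (batch_size, num_labels)
preds = logits.argmax(dim=-1).tolist()  # predicted class index per text
print(preds)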