Example no. 1
def create_datasets(self):
    # Character-level setup: tokenize=list splits each line into individual characters.
    field = Field(tokenize=list)
    train, val, test = WikiText2.splits(field, root='wikitext2_data')
    field.build_vocab(train, vectors=None)
    trains, vals, _ = BPTTIterator.splits((train, val, test),
                                          batch_size=self.args.batch,
                                          bptt_len=self.args.bptt_len,
                                          device=torch.device('cpu'))
    return trains, vals
Example no. 2
def WikiTexts(batch_size=32, bptt=30, vectors="glove.6B.100d"):
    my_tok = spacy.load('en')
    #my_tok.tokenizer.add_special_case('<eos>', [{ORTH: '<eos>'}])
    #my_tok.tokenizer.add_special_case('<bos>', [{ORTH: '<bos>'}])
    #my_tok.tokenizer.add_special_case('<unk>', [{ORTH: '<unk>'}])
    def spacy_tok(x):
        # tokenize with the spaCy pipeline loaded above
        return [tok.text for tok in my_tok.tokenizer(x)]

    TEXT = data.Field(lower=True, tokenize=spacy_tok)
    train, valid, test = WikiText2.splits(TEXT)
    TEXT.build_vocab(train, vectors=vectors)
    train_loader, val_loader, test_loader = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt,  # this is where we specify the sequence length
        #device=(0 if USE_GPU else -1),
        repeat=False)

    return train_loader, val_loader, test_loader, TEXT
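
A minimal usage sketch for the loader above (assuming the legacy torchtext API used throughout these examples): each batch yielded by BPTTIterator carries aligned text and target tensors.

train_loader, val_loader, test_loader, TEXT = WikiTexts(batch_size=32, bptt=30)

batch = next(iter(train_loader))
print(batch.text.shape)    # (bptt_len, batch_size) token indices
print(batch.target.shape)  # the same tokens shifted by one position, for next-word prediction
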
Example no. 3
    def get_data(self):
        '''
        Retrieves the WikiText2 data in a form that can be used
        for training, loading it in batches.

        Returns
        -------
        dict
            An attribute dict with the keys 'train_loader', 'valid_loader',
            'train_iter', 'vocab_size' and 'vocab'.
        '''
        TEXT = Field(tokenize=self.tokenizer, lower=True)

        train, valid, test = WikiText2.splits(TEXT)

        TEXT.build_vocab(train)
        vocab_size = len(TEXT.vocab)

        train_iter, valid_iter = BPTTIterator.splits(
            (train, valid),
            batch_size=self.config.batch_size,
            bptt_len=8,
            device=self.device,
            repeat=False)

        train_loader = Batch(dl=train_iter, x_var='text')
        valid_loader = Batch(dl=valid_iter, x_var='text')

        print(len(train_loader))

        data_dict = edict({
            'train_loader': train_loader,
            'valid_loader': valid_loader,
            'train_iter': train_iter,
            'vocab_size': vocab_size,
            'vocab': TEXT.vocab
        })

        return data_dict
Example no. 4
def evaluate_lm(model_path):
    """
    Evaluate language model against Wiki2
    Arguments
    ---------
    model_path: string
        Can be "RNN", "QRNN"
    """


    device = "cuda" if torch.cuda.is_available() else "cpu"

    model, TEXT = load_model(model_path, device)


    train, valid, test = WikiText2.splits(TEXT)


    BATCH_SIZE = 32
    BPTT_LEN = 30

    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=BATCH_SIZE,
        bptt_len=BPTT_LEN, # this is where we specify the sequence length
        device=device,
        repeat=False)

    criterion = nn.CrossEntropyLoss()

    model.eval()

    valid_loss, valid_perplexity = evaluate(model, valid_iter, criterion)
    test_loss, test_perplexity = evaluate(model, test_iter, criterion)


    print(f"Valid loss      : {valid_loss:.3f}")
    print(f"Valid perplexity: {valid_perplexity:.2f}\n")

    print(f"Test loss      : {test_loss:.3f}")
    print(f"Test perplexity: {test_perplexity:.2f}")
Example no. 5
def main(args):
    if args.device:
        device = args.device
    else:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    text_field = data.Field(tokenize=list)
    datasets = WikiText2.splits(text_field)
    text_field.build_vocab(datasets[0])

    train_iter, val_iter, test_iter = data.BPTTIterator.splits(datasets,
                                                               batch_size=32,
                                                               bptt_len=512,
                                                               device=device)

    vocab = text_field.vocab

    print(f'Vocab size: {len(vocab)}')

    model_args = dict(rnn_type='lstm',
                      ntoken=args.num_latents,
                      ninp=256,
                      nhid=1024,
                      nlayers=2)
    if args.model_args:
        model_args.update(dict(eval(args.model_args)))

    model = SHARNN(**model_args).to(device)
    model.train()

    criterion = nn.NLLLoss()

    #optim = torch.optim.SGD(model.parameters(), lr=5.0)
    optim = torch.optim.Adam(model.parameters(), lr=2e-3)

    for epoch in range(10):
        hidden = None
        mems = None

        total_loss = 0

        for step, batch in enumerate(train_iter):
            optim.zero_grad()

            if hidden is not None:
                hidden = repackage_hidden(hidden)
            if mems is not None:
                mems = repackage_hidden(mems)

            output, hidden, mems, attn_outs, _ = model(batch.text,
                                                       hidden,
                                                       return_h=True,
                                                       mems=mems)

            logits = model.decoder(output)
            logits = F.log_softmax(logits, dim=-1)

            assert logits.size(1) == batch.target.size(1)

            loss = criterion(logits.view(-1, logits.size(-1)),
                             batch.target.view(-1))
            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)

            optim.step()

            total_loss += loss.item()

            if step % args.log_interval == 0 and step > 0:
                cur_loss = total_loss / args.log_interval
                print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:05.5f} | '
                      'loss {:5.2f} | ppl {:8.2f} | bpc {:8.3f}'.format(
                          epoch, step, len(train_iter),
                          optim.param_groups[0]['lr'], cur_loss,
                          math.exp(cur_loss), cur_loss / math.log(2)))
                total_loss = 0
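
main() above calls repackage_hidden, which is not defined in this excerpt; it is presumably the usual helper from the PyTorch word-language-model example, which detaches hidden states (and memories) so gradients do not flow across BPTT windows. A sketch of that assumed helper:

import torch

def repackage_hidden(h):
    """Detach hidden states from their history (sketch of the assumed helper)."""
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(repackage_hidden(v) for v in h)
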
Example no. 6
def segment(doc):
    """
    用 Spacy 库做分词, 将一段文档切割到若干词汇.
    """

    tokenizer = tokenize.tokenizer  # the spaCy pipeline's tokenizer ('tokenize' is loaded elsewhere)
    return [token.text for token in tokenizer(doc)]


# Define the text field: preprocess the dataset by tokenizing with the rule above and lowercasing.
TEXT = data.Field(lower=True, tokenize=segment)

# torchtext's datasets module ships some ready-made datasets, such as WikiText2 below.
# This call also creates a .data directory under the project root and downloads the
# data (4.4 MB); to save readers any confusion, an identical copy is kept in the data folder.
train_set, valid_set, test_set = WikiText2.splits(TEXT)

# Check how many examples train/valid/test each contain (not yet tokenized).
print(len(train_set), len(valid_set), len(test_set), end="\n\n")

# Pre-trained word vectors can also be attached while building the vocabulary; commented out here.
TEXT.build_vocab(train_set)  # vectors="data/glove.6B.200d"

# The core of language modelling is the Iterator, which has a subclass BPTTIterator.
# Its special feature is slicing the text into contiguous, equal-length sequences
# and batching them (this is BPTT). For example:
#
#   "Machine learning is a field of computer science
#    that gives computers the ability to learn without
#    being explicitly programmed"
#
# With a slice length of 5, the text above produces the following list:
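
The comment above breaks off before the list; a minimal sketch of the slicing it describes (plain Python, whitespace tokenization, bptt_len = 5):

text = ("Machine learning is a field of computer science that gives computers "
        "the ability to learn without being explicitly programmed")
tokens = text.split()
bptt_len = 5
chunks = [tokens[i:i + bptt_len] for i in range(0, len(tokens), bptt_len)]
for chunk in chunks:
    print(chunk)
# ['Machine', 'learning', 'is', 'a', 'field']
# ['of', 'computer', 'science', 'that', 'gives']
# ...
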
Example no. 7
            loss = criterion(outs.view(-1, outs.size(-1)), targets.view(-1))
            epoch_loss += loss.item()
    return epoch_loss / len(devLoader)


###############################################################################
# Load data
###############################################################################
configfile = open('./config.yaml')
config = AttrDict(yaml.load(configfile, Loader=yaml.FullLoader))
device = torch.device(args.device)

# ? include lengths
TEXT = Field(lower=True, include_lengths=False, batch_first=False)
# TEXT: split string into tokens
trainSet, devSet, testSet = WikiText2.splits(text_field=TEXT,
                                             root=config.data.data_root)
if config.model.rnn.pretrained_embedding:
    vec = torchtext.vocab.FastText(language='en',
                                   cache=config.data.fasttext_root)
    assert vec.dim == config.model.rnn.nemd
else:
    vec = None
TEXT.build_vocab(trainSet, vectors=vec)
# TEXT: numericalize, pad, add init_token and eos_token
trainLoader, devLoader, testLoader = BPTTIterator.splits(
    (trainSet, devSet, testSet),
    batch_size=config.data.BSZ,
    bptt_len=config.data.bptt_len,
    device=device)
assert len(TEXT.vocab) == config.data.vocabSize
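
The snippet above reads everything from config.yaml through an AttrDict; a sketch of the structure that file is expected to have, inferred from the attribute accesses above (the values here are illustrative placeholders, not taken from the original project):

example_config = {
    'data': {
        'data_root': '.data',              # where the WikiText2 splits are stored
        'fasttext_root': '.vector_cache',  # cache directory for the FastText vectors
        'BSZ': 32,                         # batch size
        'bptt_len': 35,                    # BPTT sequence length
        'vocabSize': None,                 # must equal len(TEXT.vocab) after build_vocab
    },
    'model': {
        'rnn': {
            'pretrained_embedding': True,
            'nemd': 300,                   # FastText English vectors are 300-dimensional
        },
    },
}
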
Example no. 8
def train_lm(
    model_name, output_path, epochs=5, batch_size=32, bptt_len=35,
    lr=1e-3, optimizer="adam", min_freq=5, model_args={},
    scheduler_patience=5, scheduler_threshold=1e-4, early_stopping_tolerance=5):
    """
    Train and save a language model
    Arguments
    ---------
    model_name: string
        Can be "RNN", "QRNN"

    output_path: a path
        Where to save the model

    lr: float
        Learning rate, default = 1e-3

    model_args: dict
        Arguments to be passed to the created model


    """

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


    TEXT = data.Field(
        tokenizer_language='en',
        lower=True,
        init_token='<sos>',
        eos_token='<eos>',
        batch_first=True,
    )


    train, valid, test = WikiText2.splits(TEXT)

    TEXT.build_vocab(train, min_freq=min_freq)

    print(f"We have {len(TEXT.vocab)} tokens in our vocabulary")

    device = "cuda" if torch.cuda.is_available() else "cpu"


    train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
        (train, valid, test),
        batch_size=batch_size,
        bptt_len=bptt_len, # this is where we specify the sequence length
        device=device,
        repeat=False
    )

    model = create_model(model_name, TEXT, model_args=model_args)
    if "awd" in model_name:
        optimizer = "asgd"
    optimizer = create_optimizer(model, optimizer, lr)
    criterion = nn.CrossEntropyLoss()

    print(f"Using LR Scheduler with patience {scheduler_patience} and threshold {scheduler_threshold}")
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, 'min', patience=scheduler_patience, threshold=scheduler_threshold
    )

    model = model.to(device)
    criterion = criterion.to(device)

    model_path = output_path

    training_cycle(
        epochs=epochs,
        model=model, train_iter=train_iter, valid_iter=valid_iter,
        optimizer=optimizer, criterion=criterion, scheduler=lr_scheduler,
        model_path=model_path, early_stopping_tolerance=early_stopping_tolerance
    )

    model.load_state_dict(torch.load(model_path))
    model.eval()

    valid_loss, valid_perplexity = evaluate(model, valid_iter, criterion)
    test_loss, test_perplexity = evaluate(model, test_iter, criterion)


    print(f"Valid loss      : {valid_loss:.2f}")
    print(f"Valid perplexity: {valid_perplexity:.2f}\n")

    print(f"Test loss      : {test_loss:.2f}")
    print(f"Test perplexity: {test_perplexity:.2f}")


    save_model(model, TEXT, output_path)
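
Both evaluate_lm and train_lm rely on an evaluate(model, iterator, criterion) helper that is not included in these excerpts. A minimal sketch of what such a helper typically looks like (the batch handling and the model call signature here are assumptions, so the original may differ):

import math
import torch

def evaluate(model, iterator, criterion):
    # Average the cross-entropy over the iterator and report perplexity as exp(loss).
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for batch in iterator:
            output = model(batch.text)  # assumed to return logits over the vocabulary
            loss = criterion(output.view(-1, output.size(-1)), batch.target.view(-1))
            total_loss += loss.item()
    avg_loss = total_loss / len(iterator)
    return avg_loss, math.exp(avg_loss)
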
Example no. 9
import spacy

from spacy.symbols import ORTH


my_tok = spacy.load('en')


def spacy_tok(x):
    # use spaCy's tokenizer; Field(lower=True) below handles lowercasing
    return [tok.text for tok in my_tok.tokenizer(x)]


TEXT = data.Field(lower=True, tokenize=spacy_tok)

from torchtext.datasets import WikiText2

train, valid, test = WikiText2.splits(
    TEXT
)  # loading custom datasets requires passing in the field, but nothing else.

TEXT.build_vocab(train, vectors="glove.6B.200d")
train_iter, valid_iter, test_iter = data.BPTTIterator.splits(
    (train, valid, test),
    batch_size=BATCH_SIZE,
    bptt_len=30,  # this is where we specify the sequence length
    device=(0 if USE_GPU else -1),
    repeat=False)

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable as V
Example no. 10
LABELS.build_vocab(train)


a = next(iter(data.BPTTIterator(train, 20, 20)))


train_iter, dev_iter, test_iter = data.BPTTIterator.splits(
    ([i.text for i in train], dev, test),
    bptt_len=13,
    batch_size=7,
    sort_key=lambda x: len(x.text),
    device='cpu')




# https://mlexplained.com/2018/02/15/language-modeling-tutorial-in-torchtext-practical-torchtext-part-2/
from torchtext.datasets import WikiText2
train, valid, test = WikiText2.splits(TEXT)  # loading custom datasets
len(train)


data.Example?
Example no. 11
from torch.optim import Adam
import torch
from nntoolbox.callbacks import *
from nntoolbox.metrics import *

MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 16

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

# train_iterator, val_iterator, test_iterator = WikiText2.iters()
# for tmp in train_iterator:
#     print(tmp)

train_data, val_data, test_data = WikiText2.splits(TEXT)
train_iterator = data.BPTTIterator(train_data,
                                   batch_size=BATCH_SIZE,
                                   sort_within_batch=True,
                                   device=get_device(),
                                   bptt_len=35,
                                   shuffle=True)

val_iterator = data.BPTTIterator(val_data,
                                 batch_size=BATCH_SIZE,
                                 sort_within_batch=True,
                                 device=get_device(),
                                 bptt_len=35,
                                 shuffle=True)

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.100d")
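
Note that TEXT.build_vocab is called only after the two BPTTIterators have been constructed. With the legacy torchtext API this still works, because numericalization is deferred until batches are actually drawn; a quick sanity check (sketch):

# The first batch already uses the vocabulary built above.
first_batch = next(iter(train_iterator))
print(first_batch.text.dtype)  # torch.int64 token indices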