Example #1
print(df.head())

print('Number of samples: {}'.format(len(df)))

train_df = df[:25000]
test_df = df[25000:]

train_df.to_csv('data/train_data.csv', index=False)
test_df.to_csv('data/test_data.csv', index=False)

from torchtext import data

# Declare field
TEXT = data.Field(sequential=True,
                  use_vocab=True,
                  tokenize=str.split,
                  lower=True,
                  batch_first=True,
                  fix_length=20)

LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   batch_first=False,
                   is_target=True)

from torchtext.data import TabularDataset

train_data, test_data = TabularDataset.splits(path='.',
                                              train='data/train_data.csv',
                                              test='data/test_data.csv',
                                              format='csv',
                                              fields=[('text', TEXT),
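                                                      ('label', LABEL)],
                                              skip_header=True)
# The two lines above are an assumed completion of the truncated call: the CSVs
# written earlier contain a header row (hence skip_header=True), and the second
# column is assumed to hold an already-numeric label, matching the
# use_vocab=False LABEL field. A minimal sketch of the usual next steps:
TEXT.build_vocab(train_data, min_freq=5)

train_iter, test_iter = data.BucketIterator.splits(
    (train_data, test_data), batch_size=32,
    sort_key=lambda x: len(x.text))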
Example #2
    def test_init_when_nesting_field_is_not_sequential(self):
        nesting_field = data.Field(sequential=False)
        field = data.NestedField(nesting_field)

        assert field.pad_token == "<pad>"
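For context, a minimal sketch (not part of the test) of how a NestedField is typically set up: the inner field splits each word into characters, the outer field splits a sentence into words, and the default outer pad token is "<pad>".

from torchtext import data

CHARS = data.Field(tokenize=list, batch_first=True)            # char-level inner field
WORDS = data.NestedField(CHARS, tokenize=lambda s: s.split())  # word-level outer field
print(WORDS.pad_token)  # prints "<pad>"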
Example #3
    def test_numericalize(self):
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                         [
                             ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                             ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                             ["<w>"] + list("cries") + ["</w>"],
                             ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                             ["<cpad>"] * 7,
                         ]]
        numericalized = field.numericalize(examples_data)

        assert numericalized.dim() == 3
        assert numericalized.size(0) == len(examples_data)
        for example, numericalized_example in zip(examples_data,
                                                  numericalized):
            verify_numericalized_example(field,
                                         example,
                                         numericalized_example,
                                         batch_first=True)

        # test include_lengths
        nesting_field = data.Field(batch_first=True)
        field = data.NestedField(nesting_field, include_lengths=True)
        ex1 = data.Example.fromlist(["john loves mary"], [("words", field)])
        ex2 = data.Example.fromlist(["mary cries"], [("words", field)])
        dataset = data.Dataset([ex1, ex2], [("words", field)])
        field.build_vocab(dataset)
        examples_data = [[
            ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
            ["<w>"] + list("john") + ["</w>", "<cpad>"],
            ["<w>"] + list("loves") + ["</w>"],
            ["<w>"] + list("mary") + ["</w>", "<cpad>"],
            ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
        ],
                         [
                             ["<w>", "<s>", "</w>"] + ["<cpad>"] * 4,
                             ["<w>"] + list("mary") + ["</w>", "<cpad>"],
                             ["<w>"] + list("cries") + ["</w>"],
                             ["<w>", "</s>", "</w>"] + ["<cpad>"] * 4,
                             ["<cpad>"] * 7,
                         ]]

        numericalized, seq_len, word_len = field.numericalize(
            (examples_data, [5, 4], [[3, 6, 7, 6, 3], [3, 6, 7, 3, 0]]))

        assert numericalized.dim() == 3
        assert len(seq_len) == 2
        assert len(word_len) == 2

        assert numericalized.size(0) == len(examples_data)
        for example, numericalized_example in zip(examples_data,
                                                  numericalized):
            verify_numericalized_example(field,
                                         example,
                                         numericalized_example,
                                         batch_first=True)
Example #4
# load MR dataset
def mr(text_field, label_field, **kargs):
    train_data, dev_data = mydatasets.MR.splits(text_field, label_field)
    text_field.build_vocab(train_data, dev_data, min_freq=args.min_freq)
    label_field.build_vocab(train_data, dev_data)
    train_iter, dev_iter = data.Iterator.splits(
        (train_data, dev_data),
        batch_sizes=(args.batch_size, len(dev_data)),
        **kargs)
    return train_iter, dev_iter


# load data
print("\nLoading data...")
text_field = data.Field(lower=True)
# text_field = data.Field(lower=False)
label_field = data.Field(sequential=False)
static_text_field = data.Field(lower=True)
static_label_field = data.Field(sequential=False)
if args.FIVE_CLASS_TASK:
    print("Executing 5 Classification Task......")
    # train_iter, dev_iter, test_iter = mrs_five(args.datafile_path, args.name_trainfile,
    #                                            args.name_devfile, args.name_testfile, args.char_data, text_field, label_field, device=-1, repeat=False, shuffle=args.epochs_shuffle)
    if args.CNN_MUI is True or args.DEEP_CNN_MUI is True:
        train_iter, dev_iter, test_iter = mrs_five_mui(
            args.datafile_path,
            args.name_trainfile,
            args.name_devfile,
            args.name_testfile,
            args.char_data,
Example #5
if __name__ == '__main__':
    # Set up
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    DE = data.Field(tokenize=tokenize_de)
    EN = data.Field(tokenize=tokenize_en, init_token = BOS_WORD, eos_token = EOS_WORD) # only target needs BOS/EOS

    MAX_LEN = 20
    train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN), 
                                             filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
                                             len(vars(x)['trg']) <= MAX_LEN)


    MIN_FREQ = 5
    DE.build_vocab(train.src, min_freq=MIN_FREQ)
    EN.build_vocab(train.trg, min_freq=MIN_FREQ)
    print(DE.vocab.freqs.most_common(10))
    print("Size of German vocab", len(DE.vocab))
    print(EN.vocab.freqs.most_common(10))
    print("Size of English vocab", len(EN.vocab))
Example #6
if True:
    import spacy
    spacy_de = spacy.load('de')
    spacy_en = spacy.load('en')

    def tokenize_de(text):
        return [tok.text for tok in spacy_de.tokenizer(text)]

    def tokenize_en(text):
        return [tok.text for tok in spacy_en.tokenizer(text)]

    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    SRC = data.Field(tokenize=tokenize_en, pad_token=BLANK_WORD)
    TGT = data.Field(tokenize=tokenize_de, init_token = BOS_WORD, 
                     eos_token = EOS_WORD, pad_token=BLANK_WORD)

    MAX_LEN = 100
    train, val, test = datasets.IWSLT.splits(
        exts=('.en', '.de'), fields=(SRC, TGT), 
        filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and 
            len(vars(x)['trg']) <= MAX_LEN)
    MIN_FREQ = 2
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TGT.build_vocab(train.trg, min_freq=MIN_FREQ)
 
class MyIterator(data.Iterator):
    def create_batches(self):
        if self.train:
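            # Sketch of a common completion (the pooled-batching pattern from
            # the Annotated Transformer); the original snippet is cut off here.
            def pool(d, random_shuffler):
                for p in data.batch(d, self.batch_size * 100):
                    p_batch = data.batch(
                        sorted(p, key=self.sort_key),
                        self.batch_size, self.batch_size_fn)
                    for b in random_shuffler(list(p_batch)):
                        yield b
            self.batches = pool(self.data(), self.random_shuffler)
        else:
            self.batches = []
            for b in data.batch(self.data(), self.batch_size,
                                self.batch_size_fn):
                self.batches.append(sorted(b, key=self.sort_key))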
Example #7
import codecs
import os
import torch
from subword_nmt.apply_bpe import BPE
from torchtext import data, datasets

import shared

BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"
MAX_LEN = 350
MIN_VOCAB_FREQ = 1

tokenizer_fun = lambda s: s.split()
SRC = data.Field(pad_token=BLANK_WORD, batch_first=True, tokenize=tokenizer_fun)
TGT = data.Field(init_token = BOS_WORD, eos_token = EOS_WORD, pad_token=BLANK_WORD, batch_first=True, tokenize=tokenizer_fun)

def load_dataset(src_lang: str, tgt_lang: str, min_length: int = 0, only_val: bool = False):
    print("Loading dataset...")
    if only_val:
        train = None
        test = None
        val = datasets.WMT14.splits(root=os.path.abspath(os.path.join(shared.DATA_FOLDER)),
                            exts=(f'.{src_lang}', f'.{tgt_lang}'), fields=(SRC, TGT),
                            train=None, test=None,
                            filter_pred=lambda x: len(vars(x)['src']) <= MAX_LEN and len(vars(x)['trg']) <= MAX_LEN and
                                                  len(vars(x)['src']) >= min_length and len(vars(x)['trg']) >= min_length)[0]
    else:
        train, val, test = datasets.WMT14.splits(root=os.path.abspath(os.path.join(shared.DATA_FOLDER)),
                                exts=(f'.{src_lang}', f'.{tgt_lang}'), fields=(SRC, TGT),
Example #8
import torch
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe

from cove import MTLSTM


parser = ArgumentParser()
parser.add_argument('--device', default=0, help='Which device to run on; -1 for CPU')
parser.add_argument('--data', default='.data', help='where to store data')
parser.add_argument('--embeddings', default='.embeddings', help='where to store embeddings')
args = parser.parse_args()

inputs = data.Field(lower=True, include_lengths=True, batch_first=True)

print('Generating train, dev, test splits')
train, dev, test = datasets.IWSLT.splits(root=args.data, exts=['.en', '.de'], fields=[inputs, inputs])
train_iter, dev_iter, test_iter = data.Iterator.splits(
            (train, dev, test), batch_size=100, device=torch.device(args.device) if args.device >= 0 else None)

print('Building vocabulary')
inputs.build_vocab(train, dev, test)
inputs.vocab.load_vectors(vectors=GloVe(name='840B', dim=300, cache=args.embeddings))

outputs_last_layer_cove = MTLSTM(n_vocab=len(inputs.vocab), vectors=inputs.vocab.vectors, model_cache=args.embeddings)
outputs_both_layer_cove = MTLSTM(n_vocab=len(inputs.vocab), vectors=inputs.vocab.vectors, layer0=True, model_cache=args.embeddings)
outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=len(inputs.vocab), vectors=inputs.vocab.vectors, layer0=True, residual_embeddings=True, model_cache=args.embeddings)

if args.device >= 0:
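    # Assumed continuation of the truncated block: move the CoVe encoders to
    # the selected GPU.
    outputs_last_layer_cove.cuda()
    outputs_both_layer_cove.cuda()
    outputs_both_layer_cove_with_glove.cuda()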
Example #9
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)
torch.backends.cudnn.deterministic = True

if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("Warning: You have Cuda but not use it. You are using CPU for training.")

################## Load the datasets ##################
TEXT = data.Field(lower=True)
ED = data.Field(sequential=False, use_vocab=False)
train, dev = data.TabularDataset.splits(path=args.output, train='entity_train.txt', validation='entity_valid.txt', format='tsv', fields=[('text', TEXT), ('mid', ED)])
field = [('id', None), ('sub', None), ('entity', None), ('relation', None), ('obj', None), ('text', TEXT), ('ed', None)]
test = data.TabularDataset(path=os.path.join(args.output, 'test.txt'), format='tsv', fields=field)
TEXT.build_vocab(train, dev, test)  # training data includes validation data


match_embedding = 0
TEXT.vocab.vectors = torch.Tensor(len(TEXT.vocab), words_dim)
for i, token in enumerate(TEXT.vocab.itos):
    wv_index = stoi.get(token, None)
    if wv_index is not None:
        TEXT.vocab.vectors[i] = vectors[wv_index]
        match_embedding += 1
    else:
Example #10
torch.backends.cudnn.deterministic = True
if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("CUDA enabled")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print("CUDA is availabe but is not being used")
np.random.seed(args.seed)
random.seed(args.seed)

# Set up the data for training
# SST-1
if args.dataset == 'SST-1':
    TEXT = data.Field(batch_first=True, tokenize=clean_str_sst)
    LABEL = data.Field(sequential=False)
    train, dev, test = SST1Dataset.splits(TEXT, LABEL)
elif args.dataset == 'SST-2':
    TEXT = data.Field(batch_first=True)
    LABEL = data.Field(sequential=False)
    # train, dev, test = SST2Dataset.splits(TEXT, LABEL)
    train, dev, test = torchtext.datasets.SST.splits(
        TEXT,
        LABEL,
        train_subtrees=True,
        filter_pred=lambda ex: ex.label != 'neutral')

elif args.dataset == 'trec':
    TEXT = data.Field(batch_first=True)
    LABEL = data.Field(sequential=False)
Example #11
import torch
from torchtext import data
from torchtext import datasets
import random
from torchsummary import summary
SEED = 1234

torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

TEXT = data.Field(tokenize='spacy')
LABEL = data.LabelField(dtype=torch.float)

print("downloading data : ")
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

train_data, valid_data = train_data.split(random_state=random.seed(SEED))

TEXT.build_vocab(train_data, max_size=5000)
LABEL.build_vocab(train_data)

BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size=BATCH_SIZE, 
    device=device)
Example #12
from torchtext import data
from torchtext import datasets
from torchtext.vocab import GloVe, CharNGram


# Approach 1:
# set up fields
TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
LABEL = data.Field(sequential=False)


# make splits for data
train, test = datasets.TREC.splits(TEXT, LABEL, fine_grained=True)

# print information about the data
print('train.fields', train.fields)
print('len(train)', len(train))
print('vars(train[0])', vars(train[0]))

# build the vocabulary
TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
LABEL.build_vocab(train)

# print vocab information
print('len(TEXT.vocab)', len(TEXT.vocab))
print('TEXT.vocab.vectors.size()', TEXT.vocab.vectors.size())

# make iterator for splits
train_iter, test_iter = data.BucketIterator.splits(
    (train, test), batch_size=3, device=0)
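An illustrative follow-up (not part of the original example): pull one batch from the iterator; because the TEXT field was built with include_lengths=True, batch.text is a (padded_tensor, lengths) pair.

batch = next(iter(train_iter))
text, lengths = batch.text
print(text.shape, lengths.shape, batch.label.shape)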
Example #13
    if not args.cuda:
        print(
            "WARNING: You have a CUDA device, so you should probably run with --cuda"
        )
    else:
        torch.cuda.manual_seed(args.seed)

############################
# Load data
############################
print("Loading data...")

PAD_WORD = '<blank>'
eval_batch_size = args.eval_batch_size

src = data.Field(pad_token=PAD_WORD)
trg = data.Field(pad_token=PAD_WORD)

train_data = datasets.TranslationDataset(path=args.data + '/train',
                                         exts=('.en', '.de'),
                                         fields=(src, trg))
val_data = datasets.TranslationDataset(path=args.data + '/valid',
                                       exts=('.en', '.de'),
                                       fields=(src, trg))
test_data = datasets.TranslationDataset(path=args.data + '/test',
                                        exts=('.en', '.de'),
                                        fields=(src, trg))

print("DONE\n")

############################
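# Assumed continuation (the snippet is cut off at the banner above): build the
# vocabularies from the training split, then wrap the datasets in iterators.
# args.batch_size is a hypothetical flag mirroring args.eval_batch_size.
src.build_vocab(train_data, max_size=50000)
trg.build_vocab(train_data, max_size=50000)

train_iter = data.BucketIterator(train_data, batch_size=args.batch_size,
                                 sort_key=lambda x: len(x.src),
                                 train=True, repeat=False)
val_iter = data.Iterator(val_data, batch_size=eval_batch_size,
                         train=False, sort=False, repeat=False)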
Example #14
'''
https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/1%20-%20Simple%20Sentiment%20Analysis.ipynb
'''
import torch
import torch.nn as nn
from torch.autograd import Variable
from torchtext import data

# > The first difference is that we do not need to set the dtype in 
# the LABEL field. --
# https://github.com/bentrevett/pytorch-sentiment-analysis/blob/master/5%20-%20Multi-class%20Sentiment%20Analysis.ipynb
# Two classes: dtype=torch.float



TEXT = data.Field(sequential=True, include_lengths=False, batch_first=False)
LABELS = data.LabelField()
NAMES = data.RawField(is_target=False)

# Fields are assigned to columns left to right in the underlying table
fields=[('name', NAMES), ('label', LABELS), ('text', TEXT)]

train, dev, test = data.TabularDataset.splits(
    path='tmp/processed', format='CSV', fields=fields,
    train='train.csv', validation='dev.csv', test='test.csv')

# https://github.com/pytorch/text/issues/641
train_iter, dev_iter, test_iter = data.BucketIterator.splits(
    (train, dev, test),
    batch_sizes=(100, 100, 100),
    sort_key=lambda x: len(x.text),
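    # Assumed completion of the truncated call: sort within each batch (useful
    # for building packed sequences) and pick a device.
    sort_within_batch=True,
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'))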
Example #15
import torch
import torch.optim as O
import torch.nn as nn

from torchtext import data
from torchtext import datasets

from model import SNLIClassifier
from util import get_args, makedirs

args = get_args()
torch.cuda.set_device(args.gpu)
device = torch.device('cuda:{}'.format(args.gpu))

inputs = data.Field(lower=args.lower, tokenize='spacy')
answers = data.Field(sequential=False)

train, dev, test = datasets.SNLI.splits(inputs, answers)

inputs.build_vocab(train, dev, test)
if args.word_vectors:
    if os.path.isfile(args.vector_cache):
        inputs.vocab.vectors = torch.load(args.vector_cache)
    else:
        inputs.vocab.load_vectors(args.word_vectors)
        makedirs(os.path.dirname(args.vector_cache))
        torch.save(inputs.vocab.vectors, args.vector_cache)
answers.build_vocab(train)

train_iter, dev_iter, test_iter = data.BucketIterator.splits(
Example #16
def tokenizer(text):  # create a tokenizer function
    # return the text of each spacy token as a list of str
    return [tok.text for tok in spacy_en.tokenizer(text)]


from torchtext import data

import numpy as np
from data import text_utils

if __name__ == '__main__':
    args = argument_parser()
    with open("seq2seq/bak/TEXT.Field", "rb") as f:
        TEXT = dill.load(f)

    LENGTH = data.Field(sequential=False, use_vocab=False)

    embeddings = np.random.random((len(TEXT.vocab.itos), args.embed_size))
    args.TEXT = TEXT

    encoder = SN_MODELS["encoder"](embeddings, args)
    # atten = SN_MODELS["attention"](args.hidden_size * 4, 300)
    #decoder = SN_MODELS["decoder"](embeddings, args)
    atten = SN_MODELS["attention"](args.hidden_size, "general")
    decoder = SN_MODELS["decoder"](embeddings, args, atten)

    model_class = SN_MODELS[args.model_name]

    # model = model_class(encoder, decoder, args)
    model = model_class(encoder, decoder, args)
Example #17
#     return [tok for tok in j_t.tokenize(text, wakati=True)]


def tokenizer(text):
    wakati = []
    node = tagger.parseToNode(text).next
    while node.next:
        wakati.append(node.surface)
        node = node.next
    return wakati


# Field classes
TEXT = data.Field(sequential=True,
                  tokenize=tokenizer,
                  lower=True,
                  include_lengths=True,
                  batch_first=True)
LABEL = data.Field(sequential=False, use_vocab=True)
FILE = data.Field(sequential=False, use_vocab=True)

# Load the data
dataset = data.TabularDataset(path='./sentence.tsv',
                              format='tsv',
                              fields=[('Text', TEXT), ('Label', LABEL),
                                      ('File', FILE)],
                              skip_header=True)

LABEL.build_vocab(dataset)
FILE.build_vocab(dataset)
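One step the snippet stops short of (the values below are assumptions): building the TEXT vocabulary and wrapping the dataset in a BucketIterator keyed on the 'Text' field.

TEXT.build_vocab(dataset, min_freq=1)

train_iter = data.BucketIterator(dataset, batch_size=32,
                                 sort_key=lambda x: len(x.Text),
                                 sort_within_batch=True)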
Example #18
# Split dev dataset into test set and validation set
dev_set = pd.read_csv(args.dev_set)
validation_set, test_set = train_test_split(dev_set, test_size = args.test_size)

# Saving file names to variables
trainloc = args.train_set
valloc = args.save+'validation_set.csv'
testloc = args.save+'test_set.csv'

# Saving validation and test set to csv file
validation_set.to_csv(valloc, index=False)
test_set.to_csv(testloc, index=False)

# Create Field object
tokenize = lambda x: x.split()
TEXT = data.Field(tokenize=tokenize, lower=False, include_lengths = True, init_token = '<SOS>', eos_token = '<EOS>')
LEX = data.Field(tokenize=tokenize, lower=False, init_token = '<SOS>', eos_token = '<EOS>')
BIO = data.Field(tokenize=tokenize, lower=False, init_token = '<SOS>', eos_token = '<EOS>')

# Specify Fields in the dataset
fields = [('context', TEXT), ('question', TEXT), ('bio', BIO), ('lex', LEX)]

# Build the dataset
train_data, valid_data, test_data = data.TabularDataset.splits(path='', train=trainloc, validation=valloc,
                                                               test=testloc, fields=fields, format='csv', skip_header=True)

# Build vocabulary
MAX_VOCAB_SIZE = 50000
MIN_COUNT = 5
BATCH_SIZE = args.batch_size
Example #19
    ('tfidf', TfidfTransformer()),
    ('clf', LogisticRegression( penalty='l2',
                                multi_class='auto',solver='saga',
                                max_iter=100, tol=1e-3)),
     ])

text_clf.fit(train['hypothesis'], train['label'])

predicted = text_clf.predict(test['hypothesis'])
print(np.mean(predicted == test['label']))

with open("tfidf.txt", 'w') as f:
    for idx in range(len(predicted)):
        f.write("{}\n".format(map_to_word(predicted[idx])))

TEXT = data.Field(tokenize = 'spacy', lower = True)
LABEL = data.LabelField()

train_data, valid_data, test_data = datasets.SNLI.splits(TEXT, LABEL)

MIN_FREQ = 2

TEXT.build_vocab(train_data, 
                 min_freq = MIN_FREQ,
                 vectors = "glove.6B.300d",
                 unk_init = torch.Tensor.normal_)

LABEL.build_vocab(train_data)

BATCH_SIZE = 256
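A likely continuation (assumed, not shown in the truncated snippet): wrap the SNLI splits in BucketIterators with the batch size defined above.

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device)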
Example #20
def train_discriminator(
    dataset,
    dataset_fp=None,
    pretrained_model="gpt2-medium",
    epochs=10,
    batch_size=64,
    log_interval=10,
    save_model=False,
    cached=False,
    no_cuda=False,
):
    device = "cuda" if torch.cuda.is_available() and not no_cuda else "cpu"

    print("Preprocessing {} dataset...".format(dataset))
    start = time.time()

    if dataset == "SST":
        idx2class = [
            "positive", "negative", "very positive", "very negative", "neutral"
        ]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        text = torchtext_data.Field()
        label = torchtext_data.Field(sequential=False)
        train_data, val_data, test_data = datasets.SST.splits(
            text,
            label,
            fine_grained=True,
            train_subtrees=True,
        )

        x = []
        y = []
        for i in trange(len(train_data), ascii=True):
            seq = TreebankWordDetokenizer().detokenize(
                vars(train_data[i])["text"])
            seq = discriminator.tokenizer.encode(seq)
            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
            x.append(seq)
            y.append(class2idx[vars(train_data[i])["label"]])
        train_dataset = Dataset(x, y)

        test_x = []
        test_y = []
        for i in trange(len(test_data), ascii=True):
            seq = TreebankWordDetokenizer().detokenize(
                vars(test_data[i])["text"])
            seq = discriminator.tokenizer.encode(seq)
            seq = torch.tensor([50256] + seq, device=device, dtype=torch.long)
            test_x.append(seq)
            test_y.append(class2idx[vars(test_data[i])["label"]])
        test_dataset = Dataset(test_x, test_y)

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 2,
        }

    elif dataset == "clickbait":
        idx2class = ["non_clickbait", "clickbait"]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
            data = []
            for i, line in enumerate(f):
                try:
                    data.append(eval(line))
                except Exception:
                    print("Error evaluating line {}: {}".format(i, line))
                    continue
        x = []
        y = []
        with open("datasets/clickbait/clickbait_train_prefix.txt") as f:
            for i, line in enumerate(tqdm(f, ascii=True)):
                try:
                    d = eval(line)
                    seq = discriminator.tokenizer.encode(d["text"])

                    if len(seq) < max_length_seq:
                        seq = torch.tensor([50256] + seq,
                                           device=device,
                                           dtype=torch.long)
                    else:
                        print(
                            "Line {} is longer than maximum length {}".format(
                                i, max_length_seq))
                        continue
                    x.append(seq)
                    y.append(d["label"])
                except Exception:
                    print("Error evaluating / tokenizing"
                          " line {}, skipping it".format(i))
                    pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 1,
        }

    elif dataset == "toxic":
        idx2class = ["non_toxic", "toxic"]
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        x = []
        y = []
        with open("datasets/toxic/toxic_train.txt") as f:
            for i, line in enumerate(tqdm(f, ascii=True)):
                try:
                    d = eval(line)
                    seq = discriminator.tokenizer.encode(d["text"])

                    if len(seq) < max_length_seq:
                        seq = torch.tensor([50256] + seq,
                                           device=device,
                                           dtype=torch.long)
                    else:
                        print(
                            "Line {} is longer than maximum length {}".format(
                                i, max_length_seq))
                        continue
                    x.append(seq)
                    y.append(int(np.sum(d["label"]) > 0))
                except Exception:
                    print("Error evaluating / tokenizing"
                          " line {}, skipping it".format(i))
                    pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 0,
        }

    else:  # if dataset == "generic":
        # This assumes the input dataset is a TSV with the following structure:
        # class \t text

        if dataset_fp is None:
            raise ValueError("When generic dataset is selected, "
                             "dataset_fp needs to be specified aswell.")

        classes = set()
        with open(dataset_fp) as f:
            csv_reader = csv.reader(f, delimiter="\t")
            for row in tqdm(csv_reader, ascii=True):
                if row:
                    classes.add(row[0])

        idx2class = sorted(classes)
        class2idx = {c: i for i, c in enumerate(idx2class)}

        discriminator = Discriminator(class_size=len(idx2class),
                                      pretrained_model=pretrained_model,
                                      cached_mode=cached,
                                      device=device).to(device)

        x = []
        y = []
        with open(dataset_fp) as f:
            csv_reader = csv.reader(f, delimiter="\t")
            for i, row in enumerate(tqdm(csv_reader, ascii=True)):
                if row:
                    label = row[0]
                    text = row[1]

                    try:
                        seq = discriminator.tokenizer.encode(text)
                        if len(seq) < max_length_seq:
                            seq = torch.tensor([50256] + seq,
                                               device=device,
                                               dtype=torch.long)

                        else:
                            print("Line {} is longer than maximum length {}".
                                  format(i, max_length_seq))
                            continue

                        x.append(seq)
                        y.append(class2idx[label])

                    except Exception:
                        print(
                            "Error tokenizing line {}, skipping it".format(i))
                        pass

        full_dataset = Dataset(x, y)
        train_size = int(0.9 * len(full_dataset))
        test_size = len(full_dataset) - train_size
        train_dataset, test_dataset = torch.utils.data.random_split(
            full_dataset, [train_size, test_size])

        discriminator_meta = {
            "class_size": len(idx2class),
            "embed_size": discriminator.embed_size,
            "pretrained_model": pretrained_model,
            "class_vocab": class2idx,
            "default_class": 0,
        }

    end = time.time()
    print("Preprocessed {} data points".format(
        len(train_dataset) + len(test_dataset)))
    print("Data preprocessing took: {:.3f}s".format(end - start))

    if cached:
        print("Building representation cache...")

        start = time.time()

        train_loader = get_cached_data_loader(train_dataset,
                                              batch_size,
                                              discriminator,
                                              shuffle=True,
                                              device=device)

        test_loader = get_cached_data_loader(test_dataset,
                                             batch_size,
                                             discriminator,
                                             device=device)

        end = time.time()
        print("Building representation cache took: {:.3f}s".format(end -
                                                                   start))

    else:
        train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=collate_fn)
        test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                                  batch_size=batch_size,
                                                  collate_fn=collate_fn)

    if save_model:
        with open("{}_classifier_head_meta.json".format(dataset),
                  "w") as meta_file:
            json.dump(discriminator_meta, meta_file)

    optimizer = optim.Adam(discriminator.parameters(), lr=0.0001)

    for epoch in range(epochs):
        start = time.time()
        print("\nEpoch", epoch + 1)

        train_epoch(
            discriminator=discriminator,
            data_loader=train_loader,
            optimizer=optimizer,
            epoch=epoch,
            log_interval=log_interval,
            device=device,
        )
        evaluate_performance(data_loader=test_loader,
                             discriminator=discriminator,
                             device=device)

        end = time.time()
        print("Epoch took: {:.3f}s".format(end - start))

        print("\nExample prediction")
        predict(example_sentence,
                discriminator,
                idx2class,
                cached=cached,
                device=device)

        if save_model:
            # torch.save(discriminator.state_dict(),
            #           "{}_discriminator_{}.pt".format(
            #               args.dataset, epoch + 1
            #               ))
            torch.save(
                discriminator.get_classifier().state_dict(),
                "{}_classifier_head_epoch_{}.pt".format(dataset, epoch + 1),
            )
Example #21
def main():
    # Use a GPU if available, as it should be faster.
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    student.device = device
    print("Using device: {}" "\n".format(str(device)))

    # Load the training dataset, and create a dataloader to generate a batch.
    textField = data.Field(lower=True,
                           include_lengths=True,
                           batch_first=True,
                           preprocessing=student.preprocessing,
                           postprocessing=student.postprocessing,
                           stop_words=student.stopWords)
    labelField = data.Field(sequential=False, use_vocab=False, is_target=True)

    dataset = data.TabularDataset('train.json', 'json', {
        'reviewText': ('reviewText', textField),
        'rating': ('rating', labelField)
    })

    textField.build_vocab(dataset, vectors=student.wordVectors)

    # Allow training on the entire dataset, or split it for training and validation.
    if student.trainValSplit == 1:
        trainLoader = data.BucketIterator(dataset,
                                          shuffle=True,
                                          batch_size=student.batchSize,
                                          sort_key=lambda x: len(x.reviewText),
                                          sort_within_batch=True)
    else:
        train, validate = dataset.split(split_ratio=student.trainValSplit,
                                        stratified=True,
                                        strata_field='rating')

        trainLoader, valLoader = data.BucketIterator.splits(
            (train, validate),
            shuffle=True,
            batch_size=student.batchSize,
            sort_key=lambda x: len(x.reviewText),
            sort_within_batch=True)

    # Get model and optimiser from student.
    net = student.net.to(device)
    criterion = student.lossFunc
    optimiser = student.optimiser

    # Train.
    for epoch in range(student.epochs):
        runningLoss = 0

        for i, batch in enumerate(trainLoader):
            # Get a batch and potentially send it to GPU memory.
            inputs = textField.vocab.vectors[batch.reviewText[0]].to(device)
            length = batch.reviewText[1].to(device)
            labels = batch.rating.type(torch.FloatTensor).to(device)

            # PyTorch calculates gradients by accumulating contributions
            # to them (useful for RNNs).
            # Hence we must manually set them to zero before calculating them.
            optimiser.zero_grad()

            # Forward pass through the network.
            output = net(inputs, length)
            loss = criterion(output, student.convertLabel(labels))

            # Calculate gradients.
            loss.backward()

            # Minimise the loss according to the gradient.
            optimiser.step()

            runningLoss += loss.item()

            if i % 32 == 31:
                print("Epoch: %2d, Batch: %4d, Loss: %.3f" %
                      (epoch + 1, i + 1, runningLoss / 32))
                runningLoss = 0

    # Save model.
    torch.save(net.state_dict(), 'savedModel.pth')
    print("\n" "Model saved to savedModel.pth")

    # Test on validation data if it exists.
    if student.trainValSplit != 1:
        net.eval()

        closeness = [0 for _ in range(5)]
        with torch.no_grad():
            for batch in valLoader:
                # Get a batch and potentially send it to GPU memory.
                inputs = textField.vocab.vectors[batch.reviewText[0]].to(
                    device)
                length = batch.reviewText[1].to(device)
                labels = batch.rating.type(torch.FloatTensor).to(device)

                # Convert network output to integer values.
                outputs = student.convertNetOutput(net(inputs,
                                                       length)).flatten()

                for i in range(5):
                    closeness[i] += torch.sum(abs(labels -
                                                  outputs) == i).item()

        accuracy = [x / len(validate) for x in closeness]
        score = 100 * (accuracy[0] + 0.4 * accuracy[1])

        print("\n"
              "Correct predictions: {:.2%}\n"
              "One star away: {:.2%}\n"
              "Two stars away: {:.2%}\n"
              "Three stars away: {:.2%}\n"
              "Four stars away: {:.2%}\n"
              "\n"
              "Weighted score: {:.2f}".format(*accuracy, score))
Example #22
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

url = re.compile('(<url>.*</url>)')


def tokenize_de(text):
    return [tok.text for tok in spacy_de.tokenizer(url.sub('@URL@', text))]


def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(url.sub('@URL@', text))]


# Testing IWSLT
DE = data.Field(tokenize=tokenize_de)
EN = data.Field(tokenize=tokenize_en)

train, val, test = datasets.IWSLT.splits(exts=('.de', '.en'), fields=(DE, EN))

print(train.fields)
print(len(train))
print(vars(train[0]))
print(vars(train[100]))

DE.build_vocab(train.src, min_freq=3)
EN.build_vocab(train.trg, max_size=50000)

train_iter, val_iter = data.BucketIterator.splits((train, val), batch_size=3)

print(DE.vocab.freqs.most_common(10))
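An illustrative follow-up (not in the original snippet): fetch one batch; with batch_first left at its default, batch.src and batch.trg are LongTensors of shape (sequence_length, batch_size).

batch = next(iter(train_iter))
print(batch.src.shape, batch.trg.shape)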
Example #23
    def train_from_data(self, train_raw_data, test_raw_data, W, word2index,
                        args):

        self.word_embed_dim = W.shape[1]
        self.hidden_size = args.n_hidden
        self.vocab_size = len(W)
        self.output_size = 3

        if args.model == 'IOG':
            self.tagger = networks.IOG(self.word_embed_dim, self.output_size,
                                       self.vocab_size, args)
        else:
            print("model name not found")
            exit(-1)

        W = torch.from_numpy(W)
        self.tagger.word_rep.word_embed.weight = nn.Parameter(W)

        TEXT = data.Field(sequential=True,
                          use_vocab=False,
                          pad_token=0,
                          batch_first=True,
                          include_lengths=True)
        LABEL_T = data.Field(sequential=True,
                             use_vocab=False,
                             pad_token=0,
                             batch_first=True)
        LABEL_O = data.Field(sequential=True,
                             use_vocab=False,
                             pad_token=-1,
                             batch_first=True)
        LEFT_MASK = data.Field(sequential=True,
                               use_vocab=False,
                               pad_token=0,
                               batch_first=True)
        RIGHT_MASK = data.Field(sequential=True,
                                use_vocab=False,
                                pad_token=0,
                                batch_first=True)

        fields = [('text', TEXT), ('target', LABEL_T), ('label', LABEL_O),
                  ('left_mask', LEFT_MASK), ('right_mask', RIGHT_MASK)]

        if args.use_dev:
            train_texts, train_t, train_ow, dev_texts, dev_t, dev_ow = self.split_dev(
                *train_raw_data)
            dev_data = [[
                numericalize(text, word2index),
                numericalize_label(target, tag2id),
                numericalize_label(label, tag2id), *self.generate_mask(target)
            ] for text, target, label in zip(dev_texts, dev_t, dev_ow)]
            dev_dataset = ToweDataset(fields, dev_data)

        train_data = [[
            numericalize(text, word2index),
            numericalize_label(target, tag2id),
            numericalize_label(label, tag2id), *self.generate_mask(target)
        ] for text, target, label in zip(train_texts, train_t, train_ow)]
        test_data = [[
            numericalize(text, word2index),
            numericalize_label(target, tag2id),
            numericalize_label(label, tag2id), *self.generate_mask(target)
        ] for text, target, label in zip(*test_raw_data)]
        train_dataset = ToweDataset(fields, train_data)
        test_dataset = ToweDataset(fields, test_data)

        device = torch.device(
            "cuda" if torch.cuda.is_available() and cuda_flag else "cpu")
        n_gpu = torch.cuda.device_count()
        np.random.seed(args.seed)
        torch.manual_seed(args.seed)
        if n_gpu > 0:
            torch.cuda.manual_seed_all(args.seed)

        train_iter = data.Iterator(
            train_dataset,
            batch_size=args.batch_size,
            sort_within_batch=True,
            repeat=False,
            device=device if torch.cuda.is_available() else -1)
        if args.use_dev:
            dev_iter = data.Iterator(
                dev_dataset,
                batch_size=args.eval_bs,
                shuffle=False,
                sort_within_batch=True,
                repeat=False,
                device=device if torch.cuda.is_available() else -1)
        else:
            dev_iter = None
        test_iter = data.Iterator(
            test_dataset,
            batch_size=args.eval_bs,
            shuffle=False,
            sort_within_batch=True,
            repeat=False,
            device=device if torch.cuda.is_available() else -1)
        train.train(self.tagger, train_iter, dev_iter, test_iter, args=args)
        pass
Example #24
def main(config):

    if not os.path.exists(config.model_dir):
        os.makedirs(config.model_dir)

    if not os.path.exists(config.log_dir):
        os.makedirs(config.log_dir)

    print("\t \t \t the model name is {}".format(config.model_name))
    device, n_gpu = get_device()

    torch.manual_seed(config.seed)
    np.random.seed(config.seed)
    torch.manual_seed(config.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.seed)
        torch.backends.cudnn.deterministic = True  # make cuDNN deterministic so results are reproducible
    """ Prepare the SST-2 data """
    CHAR_NESTING = data.Field(tokenize=list, lower=True)
    char_field = data.NestedField(CHAR_NESTING,
                                  tokenize='spacy',
                                  fix_length=config.sequence_length)
    word_field = data.Field(tokenize='spacy',
                            lower=True,
                            include_lengths=True,
                            fix_length=config.sequence_length)
    label_field = data.LabelField(dtype=torch.long)

    train_iterator, dev_iterator, test_iterator = sst_word_char(
        config.data_path, word_field, char_field, label_field,
        config.batch_size, device, config.glove_word_file,
        config.glove_char_file, config.cache_path)
    """ 词向量准备 """
    word_embeddings = word_field.vocab.vectors
    char_embeddings = char_field.vocab.vectors

    model_file = config.model_dir + 'model1.pt'
    """ 模型准备 """
    if config.model_name == "TextRNNHighway":
        from TextRNNHighway import TextRNNHighway
        model = TextRNNHighway.TextRNNHighway(
            config.glove_word_dim, config.glove_char_dim, config.output_dim,
            config.hidden_size, config.num_layers, config.bidirectional,
            config.dropout, word_embeddings, char_embeddings,
            config.highway_layers)
    elif config.model_name == "TextCNNHighway":
        from TextCNNHighway import TextCNNHighway
        filter_sizes = [int(val) for val in config.filter_sizes.split()]
        model = TextCNNHighway.TextCNNHighway(
            config.glove_word_dim, config.glove_char_dim, config.filter_num,
            filter_sizes, config.output_dim, config.dropout, word_embeddings,
            char_embeddings, config.highway_layers)
    elif config.model_name == "LSTMATTHighway":
        from LSTMATTHighway import LSTMATTHighway
        model = LSTMATTHighway.LSTMATTHighway(
            config.glove_word_dim, config.glove_char_dim, config.output_dim,
            config.hidden_size, config.num_layers, config.bidirectional,
            config.dropout, word_embeddings, char_embeddings,
            config.highway_layers)
    elif config.model_name == "TextRCNNHighway":
        from TextRCNNHighway import TextRCNNHighway
        model = TextRCNNHighway.TextRCNNHighway(
            config.glove_word_dim, config.glove_char_dim, config.output_dim,
            config.hidden_size, config.num_layers, config.bidirectional,
            config.dropout, word_embeddings, char_embeddings,
            config.highway_layers)

    optimizer = optim.Adam(model.parameters())
    criterion = nn.CrossEntropyLoss()

    model = model.to(device)
    criterion = criterion.to(device)

    if config.do_train:

        train(config.epoch_num, model, train_iterator, dev_iterator, optimizer,
              criterion, ['0', '1'], model_file, config.log_dir,
              config.print_step, 'highway')

    model.load_state_dict(torch.load(model_file))
    criterion = nn.CrossEntropyLoss()

    test_loss, test_acc, test_report = evaluate(model, test_iterator,
                                                criterion, ['0', '1'],
                                                'highway')
    print("-------------- Test -------------")
    print(
        "\t Loss: {} | Acc: {} | Micro avg F1: {} | Macro avg F1: {} | Weighted avg F1: {}"
        .format(test_loss, test_acc, test_report['micro avg']['f1-score'],
                test_report['macro avg']['f1-score'],
                test_report['weighted avg']['f1-score']))
Example #25
    def test_build_vocab(self):
        # Set up fields
        question_field = data.Field(sequential=True)
        label_field = data.Field(sequential=False)

        # Write TSV dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="tsv")
        tsv_fields = [("id", None), ("q1", question_field),
                      ("q2", question_field), ("label", label_field)]
        tsv_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                          format="tsv",
                                          fields=tsv_fields)

        # Write JSON dataset and construct a Dataset
        self.write_test_ppid_dataset(data_format="json")
        json_fields = {
            "question1": ("q1", question_field),
            "question2": ("q2", question_field),
            "label": ("label", label_field)
        }
        json_dataset = data.TabularDataset(path=self.test_ppid_dataset_path,
                                           format="json",
                                           fields=json_fields)

        # Test build_vocab with a custom special token
        question_field.build_vocab(tsv_dataset,
                                   json_dataset,
                                   specials=['<space>'])
        assert question_field.vocab.freqs == Counter({
            'When': 4,
            'do': 4,
            'you': 4,
            'use': 4,
            'instead': 4,
            'of': 4,
            'was': 4,
            'Lincoln': 4,
            'born?': 4,
            'シ': 2,
            'し?': 2,
            'Where': 2,
            'What': 2,
            'is': 2,
            '2+2': 2,
            '"&"': 2,
            '"and"?': 2,
            'Which': 2,
            'location': 2,
            'Abraham': 2,
            '2+2=?': 2
        })
        expected_stoi = {
            '<unk>': 0,
            '<pad>': 1,
            '<space>': 2,
            'Lincoln': 3,
            'When': 4,
            'born?': 5,
            'do': 6,
            'instead': 7,
            'of': 8,
            'use': 9,
            'was': 10,
            'you': 11,
            '"&"': 12,
            '"and"?': 13,
            '2+2': 14,
            '2+2=?': 15,
            'Abraham': 16,
            'What': 17,
            'Where': 18,
            'Which': 19,
            'is': 20,
            'location': 21,
            'し?': 22,
            'シ': 23
        }
        assert dict(question_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert question_field.vocab.itos == expected_itos

        label_field.build_vocab(tsv_dataset, json_dataset)
        assert label_field.vocab.freqs == Counter({'1': 4, '0': 2})
        expected_stoi = {'1': 1, '0': 2, '<unk>': 0}
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert label_field.vocab.itos == expected_itos

        # Test build_vocab default
        question_field.build_vocab(tsv_dataset, json_dataset)
        assert question_field.vocab.freqs == Counter({
            'When': 4,
            'do': 4,
            'you': 4,
            'use': 4,
            'instead': 4,
            'of': 4,
            'was': 4,
            'Lincoln': 4,
            'born?': 4,
            'シ': 2,
            'し?': 2,
            'Where': 2,
            'What': 2,
            'is': 2,
            '2+2': 2,
            '"&"': 2,
            '"and"?': 2,
            'Which': 2,
            'location': 2,
            'Abraham': 2,
            '2+2=?': 2
        })
        expected_stoi = {
            '<unk>': 0,
            '<pad>': 1,
            'Lincoln': 2,
            'When': 3,
            'born?': 4,
            'do': 5,
            'instead': 6,
            'of': 7,
            'use': 8,
            'was': 9,
            'you': 10,
            '"&"': 11,
            '"and"?': 12,
            '2+2': 13,
            '2+2=?': 14,
            'Abraham': 15,
            'What': 16,
            'Where': 17,
            'Which': 18,
            'is': 19,
            'location': 20,
            'し?': 21,
            'シ': 22
        }
        assert dict(question_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert question_field.vocab.itos == expected_itos

        label_field.build_vocab(tsv_dataset, json_dataset)
        assert label_field.vocab.freqs == Counter({'1': 4, '0': 2})
        expected_stoi = {'1': 1, '0': 2, '<unk>': 0}
        assert dict(label_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert label_field.vocab.itos == expected_itos

        # Test build_vocab with extra kwargs passed to Vocab
        question_field.build_vocab(tsv_dataset,
                                   json_dataset,
                                   max_size=8,
                                   min_freq=3)
        assert question_field.vocab.freqs == Counter({
            'When': 4,
            'do': 4,
            'you': 4,
            'use': 4,
            'instead': 4,
            'of': 4,
            'was': 4,
            'Lincoln': 4,
            'born?': 4,
            'シ': 2,
            'し?': 2,
            'Where': 2,
            'What': 2,
            'is': 2,
            '2+2': 2,
            '"&"': 2,
            '"and"?': 2,
            'Which': 2,
            'location': 2,
            'Abraham': 2,
            '2+2=?': 2
        })
        expected_stoi = {
            '<unk>': 0,
            '<pad>': 1,
            'Lincoln': 2,
            'When': 3,
            'born?': 4,
            'do': 5,
            'instead': 6,
            'of': 7,
            'use': 8,
            'was': 9
        }
        assert dict(question_field.vocab.stoi) == expected_stoi
        # Turn the stoi dictionary into an itos list
        expected_itos = [
            x[0] for x in sorted(expected_stoi.items(), key=lambda tup: tup[1])
        ]
        assert question_field.vocab.itos == expected_itos
Example #26
def load_data(
    data_cfg: dict
) -> (Dataset, Dataset, Optional[Dataset], Vocabulary, Vocabulary):
    """
    Load train, dev and optionally test data as specified in configuration.
    Vocabularies are created from the training set with a limit of `voc_limit`
    tokens and a minimum token frequency of `voc_min_freq`
    (specified in the configuration dictionary).

    The training data is filtered to include sentences up to `max_sent_length`
    on source and target side.

    :param data_cfg: configuration dictionary for data
        ("data" part of configuation file)
    :return:
        - train_data: training dataset
        - dev_data: development dataset
        - test_data: test dataset if given, otherwise None
        - src_vocab: source vocabulary extracted from training data
        - trg_vocab: target vocabulary extracted from training data
    """
    # load data from files
    src_lang = data_cfg["src"]
    trg_lang = data_cfg["trg"]
    train_path = data_cfg["train"]
    dev_path = data_cfg["dev"]
    test_path = data_cfg.get("test", None)
    level = data_cfg["level"]
    lowercase = data_cfg["lowercase"]
    max_sent_length = data_cfg["max_sent_length"]

    tok_fun = lambda s: list(s) if level == "char" else s.split()

    src_field = data.Field(init_token=None,
                           eos_token=EOS_TOKEN,
                           pad_token=PAD_TOKEN,
                           tokenize=tok_fun,
                           batch_first=True,
                           lower=lowercase,
                           unk_token=UNK_TOKEN,
                           include_lengths=True)

    trg_field = data.Field(init_token=BOS_TOKEN,
                           eos_token=EOS_TOKEN,
                           pad_token=PAD_TOKEN,
                           tokenize=tok_fun,
                           unk_token=UNK_TOKEN,
                           batch_first=True,
                           lower=lowercase,
                           include_lengths=True)

    train_data = TranslationDataset(
        path=train_path,
        exts=("." + src_lang, "." + trg_lang),
        fields=(src_field, trg_field),
        filter_pred=lambda x: len(vars(x)['src']) <= max_sent_length and len(
            vars(x)['trg']) <= max_sent_length)

    src_max_size = data_cfg.get("src_voc_limit", sys.maxsize)
    src_min_freq = data_cfg.get("src_voc_min_freq", 1)
    trg_max_size = data_cfg.get("trg_voc_limit", sys.maxsize)
    trg_min_freq = data_cfg.get("trg_voc_min_freq", 1)

    src_vocab_file = data_cfg.get("src_vocab", None)
    trg_vocab_file = data_cfg.get("trg_vocab", None)

    src_vocab = build_vocab(field="src",
                            min_freq=src_min_freq,
                            max_size=src_max_size,
                            dataset=train_data,
                            vocab_file=src_vocab_file)
    trg_vocab = build_vocab(field="trg",
                            min_freq=trg_min_freq,
                            max_size=trg_max_size,
                            dataset=train_data,
                            vocab_file=trg_vocab_file)
    dev_data = TranslationDataset(path=dev_path,
                                  exts=("." + src_lang, "." + trg_lang),
                                  fields=(src_field, trg_field))
    test_data = None
    if test_path is not None:
        # check if target exists
        if os.path.isfile(test_path + "." + trg_lang):
            test_data = TranslationDataset(path=test_path,
                                           exts=("." + src_lang,
                                                 "." + trg_lang),
                                           fields=(src_field, trg_field))
        else:
            # no target is given -> create dataset from src only
            test_data = MonoDataset(path=test_path,
                                    ext="." + src_lang,
                                    field=src_field)
    src_field.vocab = src_vocab
    trg_field.vocab = trg_vocab
    return train_data, dev_data, test_data, src_vocab, trg_vocab
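A hedged usage sketch for load_data above. The configuration keys mirror those read inside the function; all paths are placeholders, and EOS_TOKEN/PAD_TOKEN/UNK_TOKEN/BOS_TOKEN as well as build_vocab are assumed to come from the surrounding project.

# Sketch: calling load_data with a minimal "data" configuration.
data_cfg = {
    "src": "de",
    "trg": "en",
    "train": "data/train",       # expects data/train.de and data/train.en
    "dev": "data/dev",
    "test": "data/test",         # optional; omit to get test_data=None
    "level": "word",             # or "char" for character-level tokenization
    "lowercase": True,
    "max_sent_length": 50,
    "src_voc_min_freq": 1,
    "trg_voc_min_freq": 1,
}

train_data, dev_data, test_data, src_vocab, trg_vocab = load_data(data_cfg)
print(len(train_data), len(dev_data))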
Beispiel #27
0
    def test_pad(self):
        # Default case.
        field = data.Field()
        minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"],
                     ["one", "last", "sent"]]
        expected_padded_minibatch = [["a", "sentence", "of", "data", "."],
                                     [
                                         "yet", "another", "<pad>", "<pad>",
                                         "<pad>"
                                     ],
                                     ["one", "last", "sent", "<pad>", "<pad>"]]
        expected_lengths = [5, 2, 3]
        assert field.pad(minibatch) == expected_padded_minibatch
        field = data.Field(include_lengths=True)
        assert field.pad(minibatch) == (expected_padded_minibatch,
                                        expected_lengths)

        # Test fix_length properly truncates and pads.
        field = data.Field(fix_length=3)
        minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"],
                     ["one", "last", "sent"]]
        expected_padded_minibatch = [["a", "sentence", "of"],
                                     ["yet", "another", "<pad>"],
                                     ["one", "last", "sent"]]
        expected_lengths = [3, 2, 3]
        assert field.pad(minibatch) == expected_padded_minibatch
        field = data.Field(fix_length=3, include_lengths=True)
        assert field.pad(minibatch) == (expected_padded_minibatch,
                                        expected_lengths)
        field = data.Field(fix_length=3, truncate_first=True)
        expected_padded_minibatch = [["of", "data", "."],
                                     ["yet", "another", "<pad>"],
                                     ["one", "last", "sent"]]
        assert field.pad(minibatch) == expected_padded_minibatch

        # Test init_token is properly handled.
        field = data.Field(fix_length=4, init_token="<bos>")
        minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"],
                     ["one", "last", "sent"]]
        expected_padded_minibatch = [["<bos>", "a", "sentence", "of"],
                                     ["<bos>", "yet", "another", "<pad>"],
                                     ["<bos>", "one", "last", "sent"]]
        expected_lengths = [4, 3, 4]
        assert field.pad(minibatch) == expected_padded_minibatch
        field = data.Field(fix_length=4,
                           init_token="<bos>",
                           include_lengths=True)
        assert field.pad(minibatch) == (expected_padded_minibatch,
                                        expected_lengths)

        # Test init_token and eos_token are properly handled.
        field = data.Field(init_token="<bos>", eos_token="<eos>")
        minibatch = [["a", "sentence", "of", "data", "."], ["yet", "another"],
                     ["one", "last", "sent"]]
        expected_padded_minibatch = [
            ["<bos>", "a", "sentence", "of", "data", ".", "<eos>"],
            ["<bos>", "yet", "another", "<eos>", "<pad>", "<pad>", "<pad>"],
            ["<bos>", "one", "last", "sent", "<eos>", "<pad>", "<pad>"]
        ]
        expected_lengths = [7, 4, 5]
        assert field.pad(minibatch) == expected_padded_minibatch
        field = data.Field(init_token="<bos>",
                           eos_token="<eos>",
                           include_lengths=True)
        assert field.pad(minibatch) == (expected_padded_minibatch,
                                        expected_lengths)

        # Test that non-sequential data is properly handled.
        field = data.Field(init_token="<bos>",
                           eos_token="<eos>",
                           sequential=False)
        minibatch = [["contradiction"], ["neutral"], ["entailment"]]
        assert field.pad(minibatch) == minibatch
        field = data.Field(init_token="<bos>",
                           eos_token="<eos>",
                           sequential=False,
                           include_lengths=True)
        assert field.pad(minibatch) == minibatch
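To make the behaviour exercised by test_pad above concrete, here is a minimal interactive sketch (same legacy torchtext API; the example sentences are chosen here for illustration):

# Sketch: Field.pad pads each example to the longest length in the minibatch
# (or to fix_length, if set); with include_lengths=True it also returns the
# original, pre-padding lengths.
from torchtext import data

field = data.Field(include_lengths=True)
padded, lengths = field.pad([["a", "short", "one"],
                             ["longer", "example", "with", "words"]])
print(padded)   # [['a', 'short', 'one', '<pad>'], ['longer', 'example', 'with', 'words']]
print(lengths)  # [3, 4]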
Beispiel #28
0
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
spacy_en = spacy.load('en')


# create a tokenizer function
def tokenizer(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]


"""
field在默认的情况下都期望一个输入是一组单词的序列,并且将单词映射成整数。
这个映射被称为vocab。如果一个field已经被数字化了并且不需要被序列化,
可以将参数设置为use_vocab=False以及sequential=False。
"""
LABEL = data.Field(sequential=False, use_vocab=False)
TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)

# Define the Dataset
# For csv/tsv files, TabularDataset is easy to work with, so we use it to build the Dataset
"""
We do not need the 'PhraseId' and 'SentenceId' columns, so we pass None as their field.
If your data has a header row with column names, such as 'Phrase', 'Sentiment', ...,
set skip_header=True; otherwise the header row will be treated as a data row.
"""
train, val = data.TabularDataset.splits(path='E:\\ML_data\\torchText\\',
                                        train='train.csv',
                                        validation='val.csv',
                                        format='csv',
                                        skip_header=True,
                                        fields=[('PhraseId', None),
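The fields list above is cut off in this listing. Purely as a hedged sketch (column order and names are inferred from the comments, not taken from the original code), a complete specification might look like:

# Sketch: columns to ignore get None as their field; the remaining columns are
# bound to TEXT and LABEL in the order they appear in the csv.
fields = [('PhraseId', None),
          ('SentenceId', None),
          ('Phrase', TEXT),
          ('Sentiment', LABEL)]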
Beispiel #29
0
    def __init__(
        self, train_fn,
        batch_size=64,
        valid_ratio=.1,
        device=-1,
        max_vocab=999999,
        min_freq=1,
        use_eos=False,
        shuffle=True,
    ):
        '''
        DataLoader initialization.
        :param train_fn: Train-set filename
        :param batch_size: Batchify data for the given batch size.
        :param valid_ratio: Ratio of the train set to split off as validation data.
        :param device: Device-id to load data onto (-1 for CPU)
        :param max_vocab: Maximum vocabulary size
        :param min_freq: Minimum frequency for a word to enter the vocabulary.
        :param use_eos: If True, append <EOS> to the end of every sentence.
        :param shuffle: If True, randomly shuffle the input data.
        '''
        super().__init__()

        # Define field of the input file.
        # The input file consists of two fields.
        self.label = data.Field(
            sequential=False,
            use_vocab=True,
            unk_token=None
        )
        self.text = data.Field(
            use_vocab=True,
            batch_first=True,
            include_lengths=False,
            eos_token='<EOS>' if use_eos else None,
        )

        # The two fields defined above are delimited by a TAB in the input file,
        # so we use TabularDataset to load the two columns.
        # The file at train_fn is loaded once and then split into train and
        # validation sets; each row consists of a label field and a text field.
        train, valid = data.TabularDataset(
            path=train_fn,
            format='tsv', 
            fields=[
                ('label', self.label),
                ('text', self.text),
            ],
        ).split(split_ratio=(1 - valid_ratio))

        # The loaded datasets are fed into their respective iterators:
        # a train iterator and a valid iterator.
        # Input sentences are sorted by length so that similar lengths are batched together.
        self.train_loader, self.valid_loader = data.BucketIterator.splits(
            (train, valid),
            batch_size=batch_size,
            device='cuda:%d' % device if device >= 0 else 'cpu',
            shuffle=shuffle,
            sort_key=lambda x: len(x.text),
            sort_within_batch=True,
        )

        # Finally, build vocabularies for the label and text fields.
        # This creates the mapping tables between words and indices.
        self.label.build_vocab(train)
        self.text.build_vocab(train, max_size=max_vocab, min_freq=min_freq)
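A hedged usage sketch for the loader above. The class name DataLoader is taken from its docstring, and the tsv path is a placeholder; each row of the file is expected to be "label<TAB>text".

# Sketch: build the loader on a TAB-separated file and inspect one training batch.
loader = DataLoader('data/train.tsv', batch_size=32, valid_ratio=0.2, device=-1)

for batch in loader.train_loader:
    print(batch.label.shape, batch.text.shape)   # e.g. torch.Size([32]) torch.Size([32, seq_len])
    break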
Beispiel #30
0
torch.manual_seed(args.seed)
np.random.seed(args.seed)
random.seed(args.seed)

if not args.cuda:
    args.gpu = -1
if torch.cuda.is_available() and args.cuda:
    print("Note: You are using GPU for training")
    torch.cuda.set_device(args.gpu)
    torch.cuda.manual_seed(args.seed)
if torch.cuda.is_available() and not args.cuda:
    print(
        "Warning: CUDA is available but not being used. You are training on the CPU."
    )

TEXT = data.Field(lower=True)
RELATION = data.Field(sequential=False)

train, dev, test = SQdataset.splits(TEXT, RELATION, args.data_dir)
TEXT.build_vocab(train, dev, test)
RELATION.build_vocab(train, dev)

train_iter = data.Iterator(train,
                           batch_size=args.batch_size,
                           device=args.gpu,
                           train=True,
                           repeat=False,
                           sort=False,
                           shuffle=True)
dev_iter = data.Iterator(dev,
                         batch_size=args.batch_size,