Example 1
def save_imdb_to_tsv():
    TEXT = data.Field(lower=True, include_lengths=True, batch_first=True)
    LABEL = data.Field(sequential=False, unk_token=False)
    train, val = IMDB.splits(TEXT, LABEL)

    # torchtext's IMDB labels are 'pos'/'neg'; accept both short and long forms
    str2label = {
        'neg': '0',
        'negative': '0',
        'pos': '1',
        'positive': '1',
    }
    test = val.examples
    dev = train.examples[-len(test):]
    train = train.examples[:-len(test)]

    def save_to_tsv(examples, fname):
        with open(fname, 'w') as f:
            f.write('sentence\tlabel\n')
            for e in examples:
                t, l = e.text, e.label
                t = ' '.join(t)
                l = str2label[l]
                f.write(f'{t}\t{l}\n')  # tsv format

    tsv_dir = 'data/imdb/fine-tune'
    os.makedirs(tsv_dir, exist_ok=True)  # make sure the output directory exists
    save_to_tsv(train, os.path.join(tsv_dir, 'train.tsv'))
    save_to_tsv(dev, os.path.join(tsv_dir, 'dev.tsv'))
    save_to_tsv(test, os.path.join(tsv_dir, 'test.tsv'))
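
A quick read-back check for the TSV files written above. This is only a sketch; the pandas import and the QUOTE_NONE setting are assumptions on top of the snippet (the rows are written raw, without quoting).

import csv

import pandas as pd

df = pd.read_csv('data/imdb/fine-tune/train.tsv', sep='\t',
                 quoting=csv.QUOTE_NONE)
print(df.shape, df['label'].value_counts().to_dict())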
Example 2
def imdb(embedding=None):

    # make splits for data
    train, test = IMDB.splits(TEXT, LABEL)
    train, valid = train.split(random_state=random.seed(SEED))

    TEXT.build_vocab(
        train,
        vectors=embedding,
        specials=["<pad>", "<null>"],
        unk_init=torch.Tensor.normal_,
        max_size=25000,
    )
    TEXT.null_token = "<null>"

    # Need to build the vocab for the labels because they are `pos` and `neg`
    # This will convert them to numerical values
    LABEL.build_vocab(train)

    # make iterator for splits
    train_iter, valid_iter, test_iter = BucketIterator.splits(
        (train, valid, test),
        batch_size=64,
        sort_within_batch=True,
    )

    return train_iter, valid_iter, test_iter
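
Once imdb() has run, the special-token indices registered above can be looked up from the vocabulary. A small usage sketch; it assumes the module-level TEXT field this function mutates, and the GloVe vector name is just an example.

train_iter, valid_iter, test_iter = imdb(embedding="glove.6B.100d")
pad_idx = TEXT.vocab.stoi["<pad>"]           # padding index, e.g. for nn.Embedding
null_idx = TEXT.vocab.stoi[TEXT.null_token]  # index of the extra "<null>" special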
Example 3
def test(config):
    device = 'cuda' if config['cuda'] else 'cpu'
    model = TextCNN.load(config['model_path']).to(device)
    with open(f"{config['text_vocab']}", "rb") as f:
        TEXT = dill.load(f)
    with open(f"{config['label_vocab']}", "rb") as f:
        LABEL = dill.load(f)
    _, test_data = IMDB.splits(TEXT, LABEL, root=config['data_path'])
    test_iter = torchtext.data.Iterator(test_data,
                                        batch_size=config['batch_size'],
                                        device=device)
    loss_fn = nn.CrossEntropyLoss(
        weight=torch.tensor(config['class_weight'], device=device))
    test_loss, accuracy = evaluate(model, test_iter, loss_fn)
    print(f"test_loss:{test_loss} - accuracy:{accuracy}")
Example 4
    def full_split(cls,
                   root_dir,
                   val_size=1000,
                   load_processed=True,
                   save_processed=True):
        '''Generates the full train/val/test split'''
        spd = os.path.join(root_dir, 'imdb', 'processed/')
        train_path = os.path.join(spd, 'train.pkl')
        val_path = os.path.join(spd, 'val.pkl')
        test_path = os.path.join(spd, 'test.pkl')
        if (load_processed and os.path.exists(train_path)
                and os.path.exists(val_path) and os.path.exists(test_path)):
            print(" [*] Loading pre-processed IMDB objects.")
            with open(train_path, 'rb') as train_f, \
                    open(val_path, 'rb') as val_f, \
                    open(test_path, 'rb') as test_f:
                return (pickle.load(train_f), pickle.load(val_f),
                        pickle.load(test_f))

        # This means we're not loading from pickle
        itrain, itest = IMDB.splits(RawField(), RawField(), root=root_dir)

        vocab = Vocabulary([x.text for x in itrain] + [x.text for x in itest],
                           f_min=100)

        # For val we take the middle val_size examples, as this is where the
        # pos/neg switch occurs
        mid = len(itrain) // 2
        grab = val_size // 2
        train = cls([[x.text, x.label] for x in itrain[:mid - grab]]
                    + [[x.text, x.label] for x in itrain[mid + grab:]], vocab)
        val = cls([[x.text, x.label] for x in itrain[mid - grab:mid + grab]], vocab)
        test = cls([[x.text, x.label] for x in itest], vocab)

        if save_processed:
            if not os.path.exists(spd):
                os.makedirs(spd)

            with open(train_path, 'wb') as f:
                pickle.dump(train, f)

            with open(val_path, 'wb') as f:
                pickle.dump(val, f)

            with open(test_path, 'wb') as f:
                pickle.dump(test, f)

        return train, val, test
Example 5
    def prepare_data(self):
        self.text_field = Field(sequential=True,
                                fix_length=200,
                                include_lengths=True)
        self.label_field = LabelField()

        train_val, test = IMDB.splits(self.text_field, self.label_field)
        random.seed(42)
        train, val = train_val.split(random_state=random.getstate())

        self.text_field.build_vocab(
            train, vectors=GloVe())  #vectors=FastText('simple'))
        self.label_field.build_vocab(train)

        self.train_iter, self.test_iter, self.val_iter = BucketIterator.splits(
            (train, test, val), batch_size=self.batch_size)

        self.train_iter.sort_within_batch = True
        self.val_iter.sort_within_batch = True
Example 6
def process_sents():
    def insert_index(dataset: data.Dataset):
        examples = dataset.examples
        fields = dataset.fields
        for i, e in enumerate(examples):
            setattr(e, 'index', i)
        fields['index'] = data.Field(sequential=False, use_vocab=False)
        dataset.examples = examples
        dataset.fields = fields
        return dataset

    text = data.Field(lower=True, include_lengths=True)
    label = data.Field(sequential=False, is_target=True, use_vocab=False)
    train_data, test_data = IMDB.splits(text, label)
    train_data = insert_index(train_data)
    test_data = insert_index(test_data)

    # save data
    torch.save(train_data.examples, 'data/imdb/train.data')
    torch.save(test_data.examples, 'data/imdb/test.data')
    torch.save(train_data.fields, 'data/imdb/fields')
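
A matching reload sketch for the artifacts saved above; it assumes the same paths and relies on the legacy data.Dataset constructor, which accepts an example list plus a field dict.

train_examples = torch.load('data/imdb/train.data')
fields = torch.load('data/imdb/fields')
train_data = data.Dataset(train_examples, fields)
print(len(train_data), train_data[0].label, train_data[0].index)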
Example 7
    def load(self, split_ratio=None, random_state=None, verbose=False):
        if split_ratio is None:
            split_ratio = self.split_ratio
        assert (split_ratio <= 1)
        if random_state is None:
            random_state = self.random_state

        ## create field - tokenize text & create label classes
        self.TEXT = data.Field(tokenize='spacy')
        self.LABEL = data.LabelField(dtype=torch.float)

        # load dataset
        self.train_data, self.test_data = IMDB.splits(self.TEXT, self.LABEL)

        # split training into train & validation
        self.train_data, self.valid_data = self.train_data.split(
            split_ratio=split_ratio, random_state=random_state)
        if verbose:
            print('Training data size:   ', len(self.train_data))
            print('Validation data size: ', len(self.valid_data))
            print('Test data size:       ', len(self.test_data))
Example 8
def get_data_loader(doc_processor: DocumentDataPreprocessor,
                    batch_size=3,
                    dataset_path='data/IMDB/aclImdb/train',
                    MAX_WORD_COUNT=1000,
                    MIN_DOC_THRESHOLD=300,
                    MIN_WORD_COUNT=0,
                    num_samples=None):
    text_preprocessing = None  #lambda x:mdl.model_processor(x)
    label_preprocessing = None  # lambda x:1 if 'pos' else 0
    TEXT = torchtext.data.RawField(preprocessing=text_preprocessing)
    LABEL = torchtext.data.RawField(is_target=True,
                                    preprocessing=label_preprocessing)
    dataset = IMDB(dataset_path, text_field=TEXT, label_field=LABEL)
    data_objects = [{
        'text': i.text,
        'label': i.label
    } for i in dataset.examples]
    df = pandas.DataFrame(data_objects)
    df['training_content'] = df.apply(
        lambda row: doc_processor.formatter(row['text']), axis=1)
    df = df[df['training_content'].str.split().str.len() <= MAX_WORD_COUNT]
    # Filtering post cleanup.
    df = df[df['training_content'].str.split().str.len() >= MIN_WORD_COUNT]
    if num_samples is not None:
        df = df.sample(n=num_samples)
    labels = df['label']
    training_content_df = df['training_content']
    tensor_dataset, column_split_order = doc_processor.prepare_dataset(
        training_content_df, labels, max_length=1024)
    dataloader = DataLoader(
        tensor_dataset,  # The training samples.
        sampler=RandomSampler(tensor_dataset),  # Select batches randomly
        batch_size=batch_size  # Trains with this batch size.
    )
    return dataloader, column_split_order
Example 9
from torchtext import data  # needed for data.Field / data.BucketIterator below
from torchtext.vocab import GloVe, FastText, CharNGram
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch
from torchtext.datasets import IMDB
import sys
import time
from apex import amp

is_cuda = torch.cuda.is_available()

TEXT = data.Field(lower=True, fix_length=200, batch_first=False)
LABEL = data.Field(sequential=False,)

train, test = IMDB.splits(TEXT, LABEL)

TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300), max_size=10000, min_freq=10)
LABEL.build_vocab(train,)

train_iter, test_iter = data.BucketIterator.splits((train, test), batch_size=64)
train_iter.repeat = False
test_iter.repeat = False


class IMDBrnn(nn.Module):

    def __init__(self, vocab, hidden_size, n_cat, bs=1, nl=2):
        super().__init__()
        self.hidden_size = hidden_size
        self.bs = bs
Example 10
    output = F.softmax(output, dim=-1)
    print(output)


if __name__ == "__main__":
    text_field = Field(use_vocab=False,
                       tokenize=tokenize_and_trunc,
                       preprocessing=tokenizer.convert_tokens_to_ids,
                       batch_first=True,
                       init_token=init_token_idx,
                       eos_token=eos_token_idx,
                       pad_token=pad_token_idx,
                       unk_token=unk_token_idx)
    label_field = LabelField()

    train_data, test_data = IMDB.splits(text_field, label_field)
    train_data, valid_data = train_data.split()
    label_field.build_vocab(train_data)

    n_epochs = 5
    batch_size = 128
    rnn_hidden_size = 256
    dropout_p = 0.2
    num_classes = len(label_field.vocab)
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    model = BertGRU(bert.config.to_dict()['dim'],
                    rnn_hidden_size, num_classes=num_classes,
                    dropout_p=dropout_p)

    for name, params in model.named_parameters():
Example 11
from torchtext.datasets import IMDB
from os.path import join, exists
from os import mkdir
from tqdm import tqdm
import pandas as pd

import torch
import torch.nn as nn

print("loading dataset...")
train_iter, test_iter = IMDB("datasets", split=('train', 'test'))


# tokenize
def tokenize(text):
    return [t.lower() for t in text.split()]


train_set = [(label, tokenize(line)) for label, line in tqdm(train_iter, desc="tokenizing trainset...")]
test_set = [(label, tokenize(line)) for label, line in tqdm(test_iter, desc="tokenizing testset...")]

# vocab
vocab = sorted(set(t for (_, tokens) in train_set for t in tokens))

PADDING_IDX = 0
vocab.insert(PADDING_IDX, "<padding>")

UNKNOWN_IDX = 1
vocab.insert(UNKNOWN_IDX, "<unknown>")

token2idx = {token: idx for idx, token in enumerate(vocab)}
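
A minimal numericalization sketch on top of the vocab above; the encode helper and the max_len value are ours, not from the source.

def encode(tokens, max_len=256):
    ids = [token2idx.get(t, UNKNOWN_IDX) for t in tokens[:max_len]]
    ids += [PADDING_IDX] * (max_len - len(ids))  # right-pad to a fixed length
    return torch.tensor(ids, dtype=torch.long)


batch = torch.stack([encode(tokens) for _, tokens in train_set[:8]])  # shape: (8, 256)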
Example 12
def main():
    args = parse_arguments()
    use_cuda = torch.cuda.is_available()

    # visdom for plotting
    vis = Visdom()
    win_g, win_d, win_w = None, None, None
    assert vis.check_connection()

    # load datasets
    print("[!] preparing dataset...")
    TEXT = Field(lower=True, fix_length=args.seq_len,
                 tokenize=list, batch_first=True)
    LABEL = Field(sequential=False)
    train_data, test_data = IMDB.splits(TEXT, LABEL)
    TEXT.build_vocab(train_data)
    LABEL.build_vocab(train_data)
    train_iter, test_iter = BucketIterator.splits(
            (train_data, test_data), batch_size=args.batch_size, repeat=True)
    vocab_size = len(TEXT.vocab)
    print("[TRAIN]:%d (dataset:%d)\t[TEST]:%d (dataset:%d)\t[VOCAB]:%d"
          % (len(train_iter), len(train_iter.dataset),
             len(test_iter), len(test_iter.dataset), vocab_size))

    # instantiate models
    G = Generator(dim=512, seq_len=args.seq_len, vocab_size=vocab_size)
    D = Discriminator(dim=512, seq_len=args.seq_len, vocab_size=vocab_size)
    optim_G = optim.Adam(G.parameters(), lr=args.lr, betas=(0.5, 0.9))
    optim_D = optim.Adam(D.parameters(), lr=args.lr, betas=(0.5, 0.9))

    global one, mone
    one = torch.FloatTensor([1])
    mone = one * -1
    if use_cuda:
        G, D = G.cuda(), D.cuda()
        one, mone = one.cuda(), mone.cuda()

    train_iter = iter(train_iter)
    batch_size = args.batch_size
    for b in range(1, args.batchs+1):
        # (1) Update D network
        for p in D.parameters():  # reset requires_grad
            p.requires_grad = True
        for iter_d in range(args.critic_iters):  # CRITIC_ITERS
            batch = next(train_iter)
            text, label = batch.text, batch.label
            text = to_onehot(text, vocab_size)
            if use_cuda:
                text = text.cuda()
            real = Variable(text)
            d_loss, wasserstein = train_discriminator(
                    D, G, optim_D, real, args.lamb, batch_size, use_cuda)
        # (2) Update G network
        for p in D.parameters():
            p.requires_grad = False  # to avoid computation
        g_loss = train_generator(D, G, optim_G, batch_size, use_cuda)

        # plot losses on visdom
        win_d = plot('Discriminator Loss', vis,
                     x=b, y=d_loss.data[0], win=win_d)
        win_g = plot('Generator Loss', vis,
                     x=b, y=g_loss.data[0], win=win_g)
        win_w = plot('Wasserstein Distance', vis,
                     x=b, y=wasserstein.data[0], win=win_w)

        if b % 500 == 0 and b > 1:
            samples = sample(G, TEXT, 1, args.seq_len, vocab_size, use_cuda)
            print("[%d] D:%5.2f G:%5.2f W:%5.2f \nsample:%s \t [%d]" %
                  (b, d_loss.data[0], g_loss.data[0], wasserstein.data[0],
                   samples[0], label.data[0]))
            log_sample("Sample %d" % b, vis, samples)
        if b % 5000 == 0 and b > 1:
            print("[!] saving model")
            if not os.path.isdir(".save"):
                os.makedirs(".save")
            torch.save(G.state_dict(), './.save/wgan_g_%d.pt' % (b))
            torch.save(D.state_dict(), './.save/wgan_d_%d.pt' % (b))
Example 13
from nntoolbox.sequence.utils import extract_last
from nntoolbox.components import MLP, ConcatPool
from functools import partial


MAX_VOCAB_SIZE = 25000
BATCH_SIZE = 16

TEXT = data.Field(tokenize='spacy', include_lengths=True, fix_length=500)
LABEL = data.LabelField(dtype=torch.float)
# train_data, val_data, test_data = SST.splits(
#     text_field=TEXT,
#     label_field=LABEL
# )

train_val_data, test_data = IMDB.splits(TEXT, LABEL)
train_data, val_data = train_val_data.split(split_ratio=0.8)

train_iterator, val_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, val_data, test_data),
    batch_size=BATCH_SIZE,
    sort_within_batch=True,
    device=get_device()
)

TEXT.build_vocab(train_data, max_size=MAX_VOCAB_SIZE, vectors="glove.6B.100d")
LABEL.build_vocab(train_data)

# max_length = 0
# for batch in train_iterator:
#     texts, text_lengths = batch.text
Example 14
def train(config):
    try:
        split = config["split"]
        data_path = config["data_path"]
        pretrained_model_dir = config["pretrained_model_dir"]
        pretrained_model_file = config["pretrained_model_file"]
        last_model_path = config["last_model_path"]
        save_to = config["save_to"]
        min_freq = config["min_freq"]
        batch_size = config["batch_size"]
        max_sent_length = config["max_sent_length"]
        embed_dim = config["embed_dim"]
        filter_num = config["filter_num"]
        filter_widths = config["filter_widths"]
        learning_rate = config["learning_rate"]
        patience = config["patience"]
        lr_decay = config["lr_decay"]
        max_num_trial = config["max_num_trial"]
        max_epoch = config["max_epoch"]
        save_every = config["save_every"]
        cuda = config["cuda"]
        debug = config["debug"]
    except KeyError as e:
        print(f"Input parameter error: missing config key {e}")
        exit(1)

    if not Path(save_to).exists():
        Path(save_to).mkdir()
    device = torch.device("cuda:0" if (
        torch.cuda.is_available() and cuda) else "cpu")

    # build torchtext fields; LABEL is a sequential Field, so its vocab also
    # contains <unk>/<pad> (hence the 4-element class weights below)
    TEXT = torchtext.data.Field(tokenize='spacy', lower=True)
    LABEL = torchtext.data.Field(dtype=torch.long)

    train_data, test_data = IMDB.splits(TEXT, LABEL, root=data_path)
    if debug:
        # debug mode: keep only 10% of the training examples
        train_data, val_data = train_data.split(split_ratio=0.1)
    train_data, val_data = train_data.split(split_ratio=0.7)
    train_iter, val_iter = torchtext.data.Iterator.splits(
        (train_data, val_data), batch_size=batch_size, device=device)

    pretrained_vector = None  # avoid a NameError when no pretrained vectors are configured
    if pretrained_model_file is not None and pretrained_model_dir is not None:
        pretrained_vector = Vectors(name=pretrained_model_file,
                                    cache=pretrained_model_dir)

    TEXT.build_vocab(train_data, min_freq=min_freq, vectors=pretrained_vector)
    LABEL.build_vocab(train_data)

    logging.info("saving TEXT/LABEL vocabulary...")
    with open(f"{save_to}/TEXT_vocab.bin", "wb") as f:
        dill.dump(TEXT, f)
    with open(f"{save_to}/LABEL_vocab.bin", "wb") as f:
        dill.dump(LABEL, f)

    assert embed_dim == TEXT.vocab.vectors.shape[-1], "incompatible embeddings"
    embed_num, class_num = len(TEXT.vocab), len(LABEL.vocab)

    model = TextCNN(embed_num,
                    embed_dim,
                    class_num,
                    filter_num,
                    filter_widths,
                    from_pretrained=TEXT.vocab.vectors).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    cross_entropy = nn.CrossEntropyLoss(weight=torch.tensor(
        [0, 0, 1.0, 1.0], device=device))  # class [<unk>,<pad>,'pos','neg']
    if last_model_path is not None:
        # load model
        logging.info(f'load model from  {last_model_path}')
        params = torch.load(last_model_path,
                            map_location=lambda storage, loc: storage)
        model.load_state_dict(params['state_dict'])
        logging.info('restore parameters of the optimizers')
        optimizer.load_state_dict(torch.load(last_model_path + '.optim'))

    model.train()

    epoch = 0
    cur_trial = 0
    cur_patience = 0
    hist_valid_scores = []
    train_time = begin_time = time.time()
    logging.info("begin training!")
    while True:
        epoch += 1
        train_loss = 0
        cum_cnt = 0
        step = 0
        for batch in iter(train_iter):
            # text: (seq_len, batch) -> (batch, seq_len); label: (1, batch) -> (batch,)
            feature, target = batch.text.T, batch.label.squeeze(0)
            step += 1
            optimizer.zero_grad()
            res = model(feature)
            loss = cross_entropy(res, target)
            train_loss += loss.item()  # accumulate as a float, not a graph-holding tensor
            loss.backward()
            optimizer.step()
        train_loss = train_loss / step
        val_loss, accuracy = evaluate(model, val_iter, cross_entropy)

        logging.info(
            f'epoch {epoch}\t train_loss: {train_loss}\t val_loss:{val_loss}\t val_accuracy:{accuracy}  speed:{time.time()-train_time:.2f}s/epoch\t time elapsed {time.time()-begin_time:.2f}s'
        )
        train_time = time.time()

        is_better = (len(hist_valid_scores) == 0
                     or val_loss < min(hist_valid_scores))
        hist_valid_scores.append(val_loss)

        if epoch % save_every == 0:
            model.save(f"{save_to}/model_step_{epoch}")
            torch.save(optimizer.state_dict(),
                       f"{save_to}/model_step_{epoch}.optim")
        if is_better:
            cur_patience = 0
            model_save_path = f"{save_to}/model_best"
            print(f'save currently the best model to [{model_save_path}]')
            model.save(model_save_path)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), model_save_path + '.optim')
        elif cur_patience < patience:
            cur_patience += 1
            print('hit patience %d' % cur_patience)

            if cur_patience == patience:
                cur_trial += 1
                print(f'hit #{cur_trial} trial')
                if cur_trial == max_num_trial:
                    print('early stop!')
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * lr_decay
                logging.info(
                    f'load previously best model and decay learning rate to {lr}'
                )

                # load model
                params = torch.load(model_save_path,
                                    map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                model = model.to(device)

                logging.info('restore parameters of the optimizers')
                optimizer.load_state_dict(
                    torch.load(model_save_path + '.optim'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                cur_patience = 0

        if epoch == max_epoch:
            print('reached maximum number of epochs!')
            exit(0)
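
Finally, a hypothetical config for train() covering exactly the keys it unpacks; every value below is an illustrative placeholder (embed_dim just has to match the dimensionality of the pretrained vectors, per the assert above).

config = {
    "split": 0.7,
    "data_path": "data",
    "pretrained_model_dir": ".vector_cache",
    "pretrained_model_file": "glove.6B.300d.txt",
    "last_model_path": None,
    "save_to": "checkpoints",
    "min_freq": 5,
    "batch_size": 64,
    "max_sent_length": 400,
    "embed_dim": 300,
    "filter_num": 100,
    "filter_widths": [3, 4, 5],
    "learning_rate": 1e-3,
    "patience": 5,
    "lr_decay": 0.5,
    "max_num_trial": 3,
    "max_epoch": 20,
    "save_every": 5,
    "cuda": True,
    "debug": False,
}
train(config)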