def load_datasets(data_dir, real_dataset, fake_dataset, tokenizer, batch_size,
                  max_sequence_length, random_sequence_length, epoch_size=None, token_dropout=None, seed=None):

    real_corpus = Corpus(real_dataset, data_dir=data_dir)

    if fake_dataset == "TWO":
        real_train, real_valid = real_corpus.train * 2, real_corpus.valid * 2
        fake_corpora = [Corpus(name, data_dir=data_dir) for name in ['grover_fake', 'gpt2_fake']]
        fake_train = sum([corpus.train for corpus in fake_corpora], [])
        fake_valid = sum([corpus.valid for corpus in fake_corpora], [])

    else:
        fake_corpus = Corpus(fake_dataset, data_dir=data_dir)

        real_train, real_valid = real_corpus.train, real_corpus.valid
        fake_train, fake_valid = fake_corpus.train, fake_corpus.valid

    Sampler = DistributedSampler if distributed() and dist.get_world_size() > 1 else RandomSampler

    min_sequence_length = 10 if random_sequence_length else None
    train_dataset = EncodedDataset(real_train, fake_train, tokenizer, max_sequence_length, min_sequence_length,
                                   epoch_size, token_dropout, seed)
    train_loader = DataLoader(train_dataset, batch_size, sampler=Sampler(train_dataset), num_workers=0)

    validation_dataset = EncodedDataset(real_valid, fake_valid, tokenizer, max_sequence_length, min_sequence_length)
    validation_loader = DataLoader(validation_dataset, batch_size=1, sampler=Sampler(validation_dataset))

    return train_loader, validation_loader
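
A minimal call sketch for the loader above; the GPT-2 tokenizer construction and the 'webtext'/'xl-1542M' corpus names are assumptions, not part of this snippet:

# Hypothetical usage (tokenizer and corpus names are assumptions):
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
train_loader, validation_loader = load_datasets(
    data_dir='data', real_dataset='webtext', fake_dataset='xl-1542M',
    tokenizer=tokenizer, batch_size=24, max_sequence_length=128,
    random_sequence_length=False)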
Example #2
def load_datasets(data_dir, real_dataset, fake_dataset, tokenizer,
                  max_sequence_length, random_sequence_length):

    real_corpus = Corpus(real_dataset, data_dir=data_dir, single_file=True)

    if fake_dataset == "TWO":
        real_train, real_valid = real_corpus.train * 2, real_corpus.valid * 2
        fake_corpora = [
            Corpus(name, data_dir=data_dir)
            for name in ['grover_fake', 'gpt2_fake']
        ]
        fake_train = sum([corpus.train for corpus in fake_corpora], [])
        fake_valid = sum([corpus.valid for corpus in fake_corpora], [])

    else:
        fake_corpus = Corpus(fake_dataset, data_dir=data_dir, single_file=True)

        real_valid = real_corpus.data
        fake_valid = fake_corpus.data

    min_sequence_length = 10 if random_sequence_length else None

    validation_dataset = EncodedDataset(real_valid, fake_valid, tokenizer,
                                        max_sequence_length,
                                        min_sequence_length)
    validation_loader = DataLoader(validation_dataset)

    return validation_loader
Example #3
def load_datasets(data_dir,
                  real_dataset,
                  fake_dataset,
                  tokenizer,
                  batch_size,
                  max_sequence_length,
                  random_sequence_length,
                  token_dropout=None,
                  seed=None,
                  num_train_pairs=None,
                  num_workers=1):

    download(real_dataset, fake_dataset, data_dir=data_dir)

    real_corpus = Corpus(real_dataset, data_dir=data_dir)
    fake_corpus = Corpus(fake_dataset, data_dir=data_dir)

    real_train, real_valid, real_test = real_corpus.train, real_corpus.valid, real_corpus.test
    fake_train, fake_valid, fake_test = fake_corpus.train, fake_corpus.valid, fake_corpus.test

    if num_train_pairs:
        real_sample = np.random.choice(len(real_train), num_train_pairs)
        fake_sample = np.random.choice(len(fake_train), num_train_pairs)
        real_train = [real_train[i] for i in real_sample]
        fake_train = [fake_train[i] for i in fake_sample]

    sampler = SequentialSampler
    min_sequence_length = 10 if random_sequence_length else None
    train_dataset = GPT2EncodedDataset(real_train, fake_train, tokenizer,
                                       max_sequence_length,
                                       min_sequence_length, token_dropout,
                                       seed)
    train_loader = DataLoader(train_dataset,
                              batch_size,
                              sampler=SequentialSampler(train_dataset),
                              num_workers=num_workers)

    validation_dataset = GPT2EncodedDataset(real_valid, fake_valid, tokenizer,
                                            max_sequence_length,
                                            min_sequence_length)
    validation_loader = DataLoader(
        validation_dataset,
        batch_size=batch_size,
        sampler=SequentialSampler(validation_dataset),
        num_workers=num_workers)

    test_dataset = GPT2EncodedDataset(real_test, fake_test, tokenizer,
                                      max_sequence_length, min_sequence_length)
    test_loader = DataLoader(test_dataset,
                             batch_size=batch_size,
                             sampler=SequentialSampler(test_dataset),
                             num_workers=num_workers)

    return train_loader, validation_loader, test_loader
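
A hedged reproducibility sketch for the variant above: seeding NumPy before the call makes the num_train_pairs subsampling (np.random.choice) repeatable; the tokenizer construction and corpus names are assumptions:

# Hypothetical usage (tokenizer and corpus names are assumptions; the seed value is arbitrary):
import numpy as np
from transformers import GPT2Tokenizer

np.random.seed(0)
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
train_loader, valid_loader, test_loader = load_datasets(
    'data', 'webtext', 'xl-1542M', tokenizer, batch_size=16,
    max_sequence_length=192, random_sequence_length=True, num_train_pairs=500)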
Example #4
def gen(num_sample=1000, num_lines=20):
    c = Corpus(open('lyrics_out.txt').read())

    with open('trainingdata.tar', 'rb') as f:
        model = load(f).model

    generator = model.top_bricks[0]

    sample = ComputationGraph(
        generator.generate(n_steps=num_sample, batch_size=1,
                           iterate=True)).get_theano_function()

    output_char_int = sample()[6][:, 0]

    output = "\n".join(
        ("".join(c.decode(output_char_int))).splitlines()[0:num_lines])
    return output
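
A hedged usage sketch; it only calls gen() as defined above and assumes 'lyrics_out.txt' and 'trainingdata.tar' are present in the working directory:

# Hypothetical usage: print 10 generated lines (requires lyrics_out.txt and trainingdata.tar on disk).
if __name__ == '__main__':
    print(gen(num_sample=2000, num_lines=10))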
Example #6
def load_corpus(args):
    # args = get_cmd_args()
    # prepare the whole corpus
    data_dir = args.directory + 'data/' + args.dataset + '.PKL'
    if args.random_group:  # randomly split the books into groups and save Train/Test
        with open(args.directory + 'data/GT_corpus.PKL', 'rb') as f:
            corpus_dict = pickle.load(f, encoding='bytes')

        corpus = Corpus(corpus_dict)
        Train_groups, Test_groups = corpus.get_pairs_by_group(
            number=args.random_group)
        # save the training and testing data.

        with open(data_dir, 'wb') as f:
            pickle.dump((Train_groups, Test_groups), f)
            print('saved split train/test data to {}.'.format(args.dataset))

    else:
        with open(data_dir, 'rb') as f:
            Train_groups, Test_groups = pickle.load(f)

    return Train_groups, Test_groups
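
A hedged sketch of the argument namespace load_corpus expects; the field names come from the function above, but the values are placeholders:

# Hypothetical arguments for load_corpus (values are placeholders, not from the snippet):
import argparse

args = argparse.Namespace(
    directory='./',       # project root containing a data/ folder
    dataset='my_split',   # name for the <dataset>.PKL split file under data/
    random_group=5,       # number of groups to split into; a falsy value loads the saved split
)
Train_groups, Test_groups = load_corpus(args)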
Example #7
def direct_load_dataset(data_dir,
                        dataset,
                        tokenizer,
                        max_sequence_length,
                        random_sequence_length=False):

    data_corpus = Corpus(dataset, data_dir=data_dir, single_file=True)

    data_list = data_corpus.data
    label_list = data_corpus.label

    validation_dataset = EncodedSingleDataset(data_list, label_list, tokenizer,
                                              max_sequence_length)

    validation_loader = DataLoader(validation_dataset)

    return validation_loader
Example #8
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# --------------------------------------------#
# main.py                                     #
# author: sean lee                            #
# email: [email protected]                    #
# --------------------------------------------#

import argparse

parser = argparse.ArgumentParser(description='main.py')
parser.add_argument('-train', action='store_true', default=False, help='train model')
parser.add_argument('-test', action='store_true', default=False, help='test model')
args = parser.parse_args()

from dataset import Corpus, load_data
from skipgram_softmax import Skipgram

if __name__ == '__main__':

    data = list(load_data())
    corpus = Corpus(data)
    skipgram = Skipgram(corpus)

    if args.train:
        skipgram.train()
    elif args.test:
        word = input('Input word> ')
        print(skipgram.test(word))
Example #9
    v_init = get_var_from("initial_state", model.shared_variables)
    v_states = get_var_from("H_apply_states", model.intermediary_variables)
    #v_states  = get_var_from("H_apply_states",model.intermediary_variables)

    f = theano.function([v_inchar],
                        v_softmax,
                        updates=[(v_init, v_states[0][0])])
    #f = theano.function([v_inchar], v_softmax)

    seq = [init_char]
    for _ in range(num_chars):
        dist = f(np.atleast_2d(seq[-1]).astype(np.int32))[0]
        sample = np.random.choice(vocab_size, 1, p=dist)[0]
        seq.append(sample)
    #print seq
    return seq


def sample_text(model, num_chars, corpus):
    return "".join(
        corpus.decode(sample_chars(model, num_chars, corpus.vocab_size())))


corpus = Corpus(open("corpus.txt").read())

args = getArguments()
main_loop = load(args.model)
model = main_loop.model

#print sample_text(model, args.sample_size, corpus)
Example #10
import torch
import torch.nn as nn
import numpy as np
from models import Autoencoder, Generator
from dataset import Corpus

#####################
# Generating data
#####################

ds = Corpus()
vocab = ds.vocab

generator = Generator(20, 100)
generator.eval()
generator.load_state_dict(torch.load('generator.th', map_location='cpu'))

autoencoder = Autoencoder(100, 600, 200, 100, vocab.size(), 0.5, 22)
autoencoder.eval()
autoencoder.load_state_dict(torch.load('autoencoder.th', map_location='cpu'))

# sample noise
noise = torch.FloatTensor(np.random.normal(0, 1, (1, 100)))
z = generator(noise[None, :, :])

# create new sent
logits = autoencoder.decode(z).squeeze()
seq = logits.argmax(dim=0)
print(ds.decode(seq))
Example #11
def weights_init(m):
    classname = m.__class__.__name__
    if classname == 'LSTM':
        nn.init.orthogonal_(m.weight_ih_l0)
        nn.init.orthogonal_(m.weight_hh_l0)
        nn.init.orthogonal_(m.weight_ih_l1)
        nn.init.orthogonal_(m.weight_hh_l1)


label_size = 8
batch_size = 64
learning_rate = 0.001
epochs = 10
chapters = choose_chapters2()
cp = Corpus(chapters)
train_set = TextDataset(cp, train=True)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_set = TextDataset(cp, train=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)
rnn = 'gru'

if rnn == 'lstm':
    model = LSTMNet(512, 128, vocab_size=len(cp.vocab), label_size=label_size, batch_size=batch_size).cuda()
    model.apply(weights_init)
else:
    model = GRUNet(512, 128, vocab_size=len(cp.vocab), label_size=label_size, batch_size=batch_size).cuda()

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
loss_function = nn.CrossEntropyLoss()
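
A minimal training-loop sketch for the setup above; the assumption that train_loader yields (text, label) tensor pairs is not shown in this snippet:

# Hypothetical training loop; assumes each batch is a (text, label) pair of tensors.
for epoch in range(epochs):
    model.train()
    for text, label in train_loader:
        text, label = text.cuda(), label.cuda()
        optimizer.zero_grad()
        output = model(text)
        loss = loss_function(output, label)
        loss.backward()
        optimizer.step()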
Example #12
from blocks.graph import ComputationGraph

from fuel.streams import DataStream
from fuel.schemes import SequentialScheme

from blocks.extensions import FinishAfter, Printing, ProgressBar
#from blocks.extensions.saveload import load
from blocks.serialization import load
from blocks.monitoring import aggregation  # ???

from dataset import Corpus, createDataset

args = getArguments()

corpus = Corpus(open(args.corpus).read())
train_data, vocab_size = createDataset(corpus=corpus,
                                       sequence_length=750,
                                       repeat=20)

if args.mode == "train":
    seq_len = 100
    dim = 100
    feedback_dim = 100

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh())
    generator = SequenceGenerator(
        Readout(
            readout_dim=vocab_size,
            source_names=["states"],  # transition.apply.states ???
Example #13
from fuel.schemes import SequentialScheme

from blocks.extensions import FinishAfter, Printing, ProgressBar
from blocks.extensions.saveload import Checkpoint, load

from dataset import Corpus, createDataset
from rnn_model import create_rnn

if __name__ == "__main__":
    args = parser.parse_args()

    if args.retrain:
        main_loop = load(args.model)
    else:
        # create Corpus and Dataset
        corpus = Corpus(open(args.corpus).read())
        train_data, vocab_size = createDataset(corpus=corpus,
                                               sequence_length=750,
                                               repeat=20)
        # create Computation Graph
        cg, layers, y_hat, cost = create_rnn(args.hidden,
                                             vocab_size,
                                             mode=args.mode)
        # create training loop
        main_loop = MainLoop(
            data_stream=DataStream(
                train_data,
                iteration_scheme=SequentialScheme(train_data.num_examples,
                                                  batch_size=50)),
            algorithm=GradientDescent(cost=cost,
                                      parameters=cg.parameters,
Example #14
                    default='checkpoint',
                    help='dir of checkpoint')
parser.add_argument('--resume',
                    action='store_true',
                    default=False,
                    help='resume')
parser.add_argument('--valid',
                    action='store_true',
                    default=True,
                    help='is valid')

args = parser.parse_args()

import os
from dataset import load_data, Corpus, Dataset

if args.mode == 'train':
    from trainer import Trainer

    datas, labels = load_data('./corpus/TREC.train')
    corpus = Corpus(datas, labels)

    valid_datas, valid_labels = load_data('./corpus/TREC.test')
    dataset = {
        'train': Dataset(corpus, datas, labels),
        'valid': Dataset(corpus, valid_datas, valid_labels)
    }
    args.vocab_size = corpus.vocab_size
    args.label_size = corpus.label_size
    trainer = Trainer(args)
    trainer.train(dataset)