def load_datasets(data_dir, real_dataset, fake_dataset, tokenizer, batch_size, max_sequence_length,
                  random_sequence_length, epoch_size=None, token_dropout=None, seed=None):

    real_corpus = Corpus(real_dataset, data_dir=data_dir)

    if fake_dataset == "TWO":
        # combine the Grover and GPT-2 fake corpora and duplicate the real data
        # so real and fake examples stay balanced
        real_train, real_valid = real_corpus.train * 2, real_corpus.valid * 2
        fake_corpora = [Corpus(name, data_dir=data_dir) for name in ['grover_fake', 'gpt2_fake']]
        fake_train = sum([corpus.train for corpus in fake_corpora], [])
        fake_valid = sum([corpus.valid for corpus in fake_corpora], [])
    else:
        fake_corpus = Corpus(fake_dataset, data_dir=data_dir)
        real_train, real_valid = real_corpus.train, real_corpus.valid
        fake_train, fake_valid = fake_corpus.train, fake_corpus.valid

    # use a DistributedSampler only when running under torch.distributed with more than one process
    Sampler = DistributedSampler if distributed() and dist.get_world_size() > 1 else RandomSampler

    min_sequence_length = 10 if random_sequence_length else None
    train_dataset = EncodedDataset(real_train, fake_train, tokenizer, max_sequence_length,
                                   min_sequence_length, epoch_size, token_dropout, seed)
    train_loader = DataLoader(train_dataset, batch_size, sampler=Sampler(train_dataset), num_workers=0)

    validation_dataset = EncodedDataset(real_valid, fake_valid, tokenizer, max_sequence_length,
                                        min_sequence_length)
    validation_loader = DataLoader(validation_dataset, batch_size=1, sampler=Sampler(validation_dataset))

    return train_loader, validation_loader
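# Usage sketch for the loader above (not part of the original snippet). It assumes the
# Hugging Face `transformers` package and that corpora named 'webtext' and 'gpt2_fake'
# exist under data_dir; the dataset names and hyperparameters are illustrative only.
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
train_loader, validation_loader = load_datasets(
    data_dir='data', real_dataset='webtext', fake_dataset='gpt2_fake',
    tokenizer=tokenizer, batch_size=24, max_sequence_length=128,
    random_sequence_length=True, epoch_size=None, token_dropout=None, seed=42)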
def load_datasets(data_dir, real_dataset, fake_dataset, tokenizer, max_sequence_length,
                  random_sequence_length):

    real_corpus = Corpus(real_dataset, data_dir=data_dir, single_file=True)

    if fake_dataset == "TWO":
        real_train, real_valid = real_corpus.train * 2, real_corpus.valid * 2
        fake_corpora = [Corpus(name, data_dir=data_dir) for name in ['grover_fake', 'gpt2_fake']]
        fake_train = sum([corpus.train for corpus in fake_corpora], [])
        fake_valid = sum([corpus.valid for corpus in fake_corpora], [])
    else:
        fake_corpus = Corpus(fake_dataset, data_dir=data_dir, single_file=True)
        real_valid = real_corpus.data
        fake_valid = fake_corpus.data

    min_sequence_length = 10 if random_sequence_length else None
    validation_dataset = EncodedDataset(real_valid, fake_valid, tokenizer, max_sequence_length,
                                        min_sequence_length)
    validation_loader = DataLoader(validation_dataset)

    return validation_loader
def load_datasets(data_dir, real_dataset, fake_dataset, tokenizer, batch_size, max_sequence_length,
                  random_sequence_length, token_dropout=None, seed=None, num_train_pairs=None,
                  num_workers=1):

    download(real_dataset, fake_dataset, data_dir=data_dir)

    real_corpus = Corpus(real_dataset, data_dir=data_dir)
    fake_corpus = Corpus(fake_dataset, data_dir=data_dir)

    real_train, real_valid, real_test = real_corpus.train, real_corpus.valid, real_corpus.test
    fake_train, fake_valid, fake_test = fake_corpus.train, fake_corpus.valid, fake_corpus.test

    if num_train_pairs:
        # subsample an equal number of real and fake training texts
        real_sample = np.random.choice(len(real_train), num_train_pairs)
        fake_sample = np.random.choice(len(fake_train), num_train_pairs)
        real_train = [real_train[i] for i in real_sample]
        fake_train = [fake_train[i] for i in fake_sample]

    min_sequence_length = 10 if random_sequence_length else None
    train_dataset = GPT2EncodedDataset(real_train, fake_train, tokenizer, max_sequence_length,
                                       min_sequence_length, token_dropout, seed)
    train_loader = DataLoader(train_dataset, batch_size, sampler=SequentialSampler(train_dataset),
                              num_workers=num_workers)

    validation_dataset = GPT2EncodedDataset(real_valid, fake_valid, tokenizer, max_sequence_length,
                                            min_sequence_length)
    validation_loader = DataLoader(validation_dataset, batch_size=batch_size,
                                   sampler=SequentialSampler(validation_dataset), num_workers=num_workers)

    test_dataset = GPT2EncodedDataset(real_test, fake_test, tokenizer, max_sequence_length,
                                      min_sequence_length)
    test_loader = DataLoader(test_dataset, batch_size=batch_size,
                             sampler=SequentialSampler(test_dataset), num_workers=num_workers)

    return train_loader, validation_loader, test_loader
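# Illustrative call of the three-way loader above (not from the original source). The corpus
# names, batch size, and sequence length are placeholders; `tokenizer` is assumed to be a
# GPT-2 tokenizer constructed as in the earlier sketch.
train_loader, validation_loader, test_loader = load_datasets(
    data_dir='data', real_dataset='webtext', fake_dataset='xl-1542M',
    tokenizer=tokenizer, batch_size=8, max_sequence_length=256,
    random_sequence_length=False, num_train_pairs=1000)
for batch in train_loader:
    pass  # the forward/backward pass would go here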
def gen(num_sample=1000, num_lines=20):
    c = Corpus(open('lyrics_out.txt').read())

    # load the trained main loop and pull the generator brick out of the model
    with open('trainingdata.tar', 'rb') as f:
        model = load(f).model
    generator = model.top_bricks[0]

    # compile a Theano function that samples num_sample characters in one call
    sample = ComputationGraph(
        generator.generate(n_steps=num_sample, batch_size=1, iterate=True)).get_theano_function()

    # output index 6 holds the sampled character ids; decode them and keep num_lines lines
    output_char_int = sample()[6][:, 0]
    output = "\n".join("".join(c.decode(output_char_int)).splitlines()[0:num_lines])
    return output
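# Example call (assumes 'lyrics_out.txt' and 'trainingdata.tar' from the snippet above
# are present in the working directory; the argument values are illustrative).
print(gen(num_sample=2000, num_lines=10))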
def load_corpus(args):
    # args = get_cmd_args()
    # prepare the whole corpus
    data_dir = args.directory + 'data/' + args.dataset + '.PKL'
    if args.random_group:
        # randomly split the books into groups and save the train/test split
        with open(args.directory + 'data/GT_corpus.PKL', 'rb') as f:
            corpus_dict = pickle.load(f, encoding='bytes')
        corpus = Corpus(corpus_dict)
        Train_groups, Test_groups = corpus.get_pairs_by_group(number=args.random_group)
        # save the training and testing data
        with open(data_dir, 'wb') as f:
            pickle.dump((Train_groups, Test_groups), f)
        print('saved split train/test data to {}.'.format(args.dataset))
    else:
        with open(data_dir, 'rb') as f:
            Train_groups, Test_groups = pickle.load(f)
    return Train_groups, Test_groups
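# Sketch of the argparse namespace load_corpus expects (not from the original source).
# The attribute names (directory, dataset, random_group) are read off the function body;
# the defaults below are illustrative placeholders.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--directory', type=str, default='./')
parser.add_argument('--dataset', type=str, default='book_groups')
parser.add_argument('--random_group', type=int, default=0,
                    help='nonzero triggers a fresh random split via get_pairs_by_group; 0 loads the saved split')
args = parser.parse_args()

Train_groups, Test_groups = load_corpus(args)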
def direct_load_dataset(data_dir, dataset, tokenizer, max_sequence_length, random_sequence_length=False):
    data_corpus = Corpus(dataset, data_dir=data_dir, single_file=True)
    data_list = data_corpus.data
    label_list = data_corpus.label

    validation_dataset = EncodedSingleDataset(data_list, label_list, tokenizer, max_sequence_length)
    validation_loader = DataLoader(validation_dataset)

    return validation_loader
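# Minimal usage sketch for direct_load_dataset (names are placeholders; the tokenizer is
# assumed to be a GPT-2 tokenizer from `transformers`, as in the earlier sketches).
loader = direct_load_dataset(data_dir='data', dataset='mixed_test',
                             tokenizer=tokenizer, max_sequence_length=128)
for batch in loader:
    pass  # run the classifier on each encoded example here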
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# -------------------------------------------#
#   main.py                                   #
#   author: sean lee                          #
#   email: [email protected]                  #
# -------------------------------------------#

import argparse

parser = argparse.ArgumentParser(description='main.py')
parser.add_argument('-train', action='store_true', default=False, help='train model')
parser.add_argument('-test', action='store_true', default=False, help='test model')
args = parser.parse_args()

from dataset import Corpus, load_data
from skipgram_softmax import Skipgram

if __name__ == '__main__':
    data = list(load_data())
    corpus = Corpus(data)
    skipgram = Skipgram(corpus)

    if args.train:
        skipgram.train()
    elif args.test:
        word = input('Input word> ')
        print(skipgram.test(word))
def sample_chars(model, num_chars, vocab_size):
    # NOTE: the original snippet starts mid-function; the earlier lines that define
    # v_inchar, v_softmax and init_char from the model are not shown.
    v_init = get_var_from("initial_state", model.shared_variables)
    v_states = get_var_from("H_apply_states", model.intermediary_variables)

    # compile a step function whose update feeds the recurrent state back into the
    # initial state, so successive calls continue the same sequence
    f = theano.function([v_inchar], v_softmax, updates=[(v_init, v_states[0][0])])

    seq = [init_char]
    for _ in range(num_chars):
        dist = f(np.atleast_2d(seq[-1]).astype(np.int32))[0]
        sample = np.random.choice(vocab_size, 1, p=dist)[0]
        seq.append(sample)
    return seq


def sample_text(model, num_chars, corpus):
    return "".join(corpus.decode(sample_chars(model, num_chars, corpus.vocab_size())))


corpus = Corpus(open("corpus.txt").read())
args = getArguments()
main_loop = load(args.model)
model = main_loop.model
print(sample_text(model, args.sample_size, corpus))
import torch
import torch.nn as nn
import numpy as np

from models import Autoencoder, Generator
from dataset import Corpus

#####################
# Generating data
#####################

ds = Corpus()
vocab = ds.vocab

generator = Generator(20, 100)
generator.eval()
generator.load_state_dict(torch.load('generator.th', map_location='cpu'))

autoencoder = Autoencoder(100, 600, 200, 100, vocab.size(), 0.5, 22)
autoencoder.eval()
autoencoder.load_state_dict(torch.load('autoencoder.th', map_location='cpu'))

# sample a noise vector and map it to a latent code
noise = torch.FloatTensor(np.random.normal(0, 1, (1, 100)))
z = generator(noise[None, :, :])

# decode the latent code into a new sentence
logits = autoencoder.decode(z).squeeze()
seq = logits.argmax(dim=0)
print(ds.decode(seq))
def weights_init(m):
    classname = m.__class__.__name__
    if classname == 'LSTM':
        nn.init.orthogonal_(m.weight_ih_l0)
        nn.init.orthogonal_(m.weight_hh_l0)
        nn.init.orthogonal_(m.weight_ih_l1)
        nn.init.orthogonal_(m.weight_hh_l1)


label_size = 8
batch_size = 64
learning_rate = 0.001
epochs = 10

chapters = choose_chapters2()
cp = Corpus(chapters)

train_set = TextDataset(cp, train=True)
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
test_set = TextDataset(cp, train=False)
test_loader = DataLoader(test_set, batch_size=batch_size, shuffle=True)

rnn = 'gru'
if rnn == 'lstm':
    model = LSTMNet(512, 128, vocab_size=len(cp.vocab), label_size=label_size,
                    batch_size=batch_size).cuda()
    model.apply(weights_init)
else:
    model = GRUNet(512, 128, vocab_size=len(cp.vocab), label_size=label_size,
                   batch_size=batch_size).cuda()

optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-4)
loss_function = nn.CrossEntropyLoss()
from blocks.graph import ComputationGraph
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from blocks.extensions import FinishAfter, Printing, ProgressBar
from blocks.serialization import load
from blocks.monitoring import aggregation

from dataset import Corpus, createDataset

args = getArguments()
corpus = Corpus(open(args.corpus).read())
train_data, vocab_size = createDataset(corpus=corpus, sequence_length=750, repeat=20)

if args.mode == "train":
    seq_len = 100
    dim = 100
    feedback_dim = 100

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim, activation=Tanh())
    generator = SequenceGenerator(
        Readout(
            readout_dim=vocab_size,
            source_names=["states"],  # i.e. transition.apply.states
from fuel.schemes import SequentialScheme
from blocks.extensions import FinishAfter, Printing, ProgressBar
from blocks.extensions.saveload import Checkpoint, load

from dataset import Corpus, createDataset
from rnn_model import create_rnn

if __name__ == "__main__":
    args = parser.parse_args()

    if args.retrain:
        main_loop = load(args.model)
    else:
        # create the Corpus and Dataset
        corpus = Corpus(open(args.corpus).read())
        train_data, vocab_size = createDataset(corpus=corpus, sequence_length=750, repeat=20)

        # create the computation graph
        cg, layers, y_hat, cost = create_rnn(args.hidden, vocab_size, mode=args.mode)

        # create the training loop
        main_loop = MainLoop(
            data_stream=DataStream(
                train_data,
                iteration_scheme=SequentialScheme(train_data.num_examples, batch_size=50)),
            algorithm=GradientDescent(cost=cost, parameters=cg.parameters,
                    default='checkpoint', help='dir of checkpoint')
parser.add_argument('--resume', action='store_true', default=False, help='resume')
parser.add_argument('--valid', action='store_true', default=True, help='is valid')
args = parser.parse_args()

import os
from dataset import load_data, Corpus, Dataset

if args.mode == 'train':
    from trainer import Trainer

    datas, labels = load_data('./corpus/TREC.train')
    corpus = Corpus(datas, labels)
    valid_datas, valid_labels = load_data('./corpus/TREC.test')

    dataset = {
        'train': Dataset(corpus, datas, labels),
        'valid': Dataset(corpus, valid_datas, valid_labels),
    }

    args.vocab_size = corpus.vocab_size
    args.label_size = corpus.label_size

    trainer = Trainer(args)
    trainer.train(dataset)