def load_dataset(src, trg, batch_size, max_size=100000, min_freq=5, gpu=False,
                 shuffle=True, sort_key=default_sort_key, **kwargs):
    """
    Wrapper function for dataset with sensible, overwritable defaults
    """
    tweets_dict = Dict(pad_token='<pad>', eos_token='<eos>', bos_token='<bos>',
                       max_size=max_size, min_freq=min_freq)
    labels_dict = Dict(sequential=False, force_unk=False)
    tweets_dict.fit(src)
    labels_dict.fit(trg)
    d = {'src': tweets_dict, 'trg': labels_dict}
    splits = PairedDataset(src, trg, d, batch_size, gpu=gpu).splits(
        shuffle=shuffle, sort_key=sort_key, **kwargs)
    return splits
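# Hedged usage sketch (not part of the original module): assumes `src` is a
# list of tokenized tweets, `trg` a parallel list of labels, and that
# `splits()` accepts `dev`/`test` proportions as in the loaders further down.
if __name__ == '__main__':
    src = [['nice', 'day', 'today'], ['so', 'much', 'rain']] * 50
    trg = ['pos', 'neg'] * 50
    train, valid, test = load_dataset(src, trg, batch_size=10,
                                      dev=0.1, test=0.1)
    print(len(train), len(valid), len(test))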
def load_from_lines(path, batch_size, max_size=1000000, min_freq=5, gpu=False,
                    shuffle=True, sort_key=lambda x: len(x[0]), **kwargs):
    lines = load_lines(path)
    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq)
    ldict.fit(lines)
    # mock labels so the unlabelled lines fit the PairedDataset interface
    mock_labels = make_mock_labels(lines)
    mock = Dict()
    mock.fit(mock_labels)
    d = {'src': ldict, 'trg': mock}
    splits = PairedDataset(lines, mock_labels, d, batch_size, gpu=gpu).splits(
        shuffle=shuffle, sort_key=sort_key, **kwargs)
    return splits
def load_penn(path, batch_size, max_size=1000000, min_freq=1, gpu=False,
              shuffle=True):
    train_data = load_lines(os.path.join(path, 'train.txt'))
    valid_data = load_lines(os.path.join(path, 'valid.txt'))
    test_data = load_lines(os.path.join(path, 'test.txt'))
    d = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
             max_size=max_size, min_freq=min_freq)
    d.fit(train_data, valid_data)
    train = PairedDataset(train_data, None, {'src': d}, batch_size, gpu=gpu)
    valid = PairedDataset(valid_data, None, {'src': d}, batch_size, gpu=gpu,
                          evaluation=True)
    test = PairedDataset(test_data, None, {'src': d}, batch_size, gpu=gpu,
                         evaluation=True)
    return train.sort_(), valid.sort_(), test.sort_()
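# Hedged usage sketch (not from the original source): the path is a
# placeholder for a PTB-style directory with train.txt, valid.txt and
# test.txt, one sentence per line; `dicts` is accessed as in the scripts below.
if __name__ == '__main__':
    train, valid, test = load_penn('data/penn', batch_size=20)
    print(' * vocabulary size. %d' % len(train.dicts['src']))
    print(' * number of train batches. %d' % len(train))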
def setUp(self):
    text = [lorem.sentence() for _ in range(100)]
    self.d = Dict(pad_token=u.PAD).fit(text)
    self.corpus = list(self.d.transform(text))
    self.num_words = sum(len(s.split()) for s in text)
    self.word_lengths = [len(s.split()) for s in text]
    self.max_word_len = max(len(w) for s in text for w in s.split())
    self.max_seq_words = max(len(s.split()) for s in text)
def viz_encoder_decoder(**kwargs):
    from modules.encoder_decoder import EncoderDecoder
    num_layers, emb_dim, hid_dim, att_dim = 1, 12, 16, 16
    d = Dict(pad_token='<pad>').fit(['a'])
    m = EncoderDecoder(num_layers, emb_dim, hid_dim, att_dim, d, **kwargs)
    src, trg = torch.LongTensor([[0, 1]]), torch.LongTensor([[0, 1]])
    out = m(Variable(src), Variable(trg))
    return make_dot(out)
def viz_vae(**kwargs):
    from vae import SequenceVAE
    d = Dict(pad_token='<pad>').fit(['a'])
    num_layers, emb_dim, hid_dim, z_dim = 1, 12, 16, 16
    m = SequenceVAE(num_layers, emb_dim, hid_dim, z_dim, d)
    src = Variable(torch.LongTensor([[0, 1]]))
    logs, mu, logvar = m(src, src)
    z = m.encoder.reparametrize(mu, logvar)
    return make_dot(logs), make_dot(mu), make_dot(logvar), make_dot(z)
def load_penn(path, batch_size, max_size=1000000, min_freq=1, gpu=False,
              shuffle=True, sort_key=lambda pair: len(pair[0])):
    train_data = load_lines(os.path.join(path, 'train.txt'))
    train_labels = make_mock_labels(train_data)
    valid_data = load_lines(os.path.join(path, 'valid.txt'))
    valid_labels = make_mock_labels(valid_data)
    test_data = load_lines(os.path.join(path, 'test.txt'))
    test_labels = make_mock_labels(test_data)
    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq)
    ldict.fit(train_data, valid_data)
    mock = Dict().fit(train_labels)
    d = {'src': ldict, 'trg': mock}
    train = PairedDataset(train_data, train_labels, d, batch_size,
                          gpu=gpu).sort_(sort_key=sort_key)
    valid = PairedDataset(valid_data, valid_labels, d, batch_size, gpu=gpu,
                          evaluation=True).sort_(sort_key=sort_key)
    test = PairedDataset(test_data, test_labels, d, batch_size, gpu=gpu,
                         evaluation=True).sort_(sort_key=sort_key)
    return train, valid, test
def load_from_lines(path, batch_size, max_size=1000000, min_freq=5, gpu=False,
                    shuffle=True, **kwargs):
    lines = load_lines(path)
    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq).fit(lines)
    return PairedDataset(lines, None, {'src': ldict}, batch_size,
                         gpu=gpu).splits(shuffle=shuffle, **kwargs)
def load_split_data(path, batch_size, max_size, min_freq, max_len, device,
                    processor):
    """
    Load a corpus that is already split into 'train.txt', 'valid.txt'
    and 'test.txt'
    """
    train = load_lines(os.path.join(path, 'train.txt'), max_len, processor)
    valid = load_lines(os.path.join(path, 'valid.txt'), max_len, processor)
    d = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
             max_size=max_size, min_freq=min_freq, force_unk=True
             ).fit(train, valid)
    # reload the data after fitting the vocabulary
    train = load_lines(os.path.join(path, 'train.txt'), max_len, processor)
    valid = load_lines(os.path.join(path, 'valid.txt'), max_len, processor)
    test = load_lines(os.path.join(path, 'test.txt'), max_len, processor)
    train = PairedDataset(train, None, {'src': d}, batch_size, device=device)
    valid = PairedDataset(valid, None, {'src': d}, batch_size, device=device)
    test = PairedDataset(test, None, {'src': d}, batch_size, device=device)
    return train.sort_(), valid.sort_(), test.sort_()
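# Hedged usage sketch (illustrative only): the path, device and hyperparameter
# values are placeholders; `text_processor` is used as in the scripts below.
if __name__ == '__main__':
    proc = text_processor(lower=True, num=True, level='token')
    train, valid, test = load_split_data(
        'data/corpus', batch_size=32, max_size=50000, min_freq=5,
        max_len=100, device='cpu', processor=proc)
    print(' * number of train batches. %d' % len(train))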
if args.processed:
    print("Loading preprocessed datasets...")
    assert args.dict_path, "Processed data requires DICT_PATH"
    data, d = load_from_file(args.path), u.load_model(args.dict_path)
    train, test, valid = BlockDataset(
        data, d, args.batch_size, args.bptt, gpu=args.gpu, fitted=True
    ).splits(test=0.1, dev=0.1)
    del data
else:
    print("Processing datasets...")
    proc = text_processor(lower=args.lower, num=args.num, level=args.level)
    train_data = load_lines(args.path + 'train.txt', processor=proc)
    valid_data = load_lines(args.path + 'valid.txt', processor=proc)
    test_data = load_lines(args.path + 'test.txt', processor=proc)
    d = Dict(max_size=args.max_size, min_freq=args.min_freq, eos_token=u.EOS)
    d.fit(train_data, valid_data)
    train = BlockDataset(
        train_data, d, args.batch_size, args.bptt, gpu=args.gpu)
    valid = BlockDataset(
        valid_data, d, args.batch_size, args.bptt, gpu=args.gpu,
        evaluation=True)
    test = BlockDataset(
        test_data, d, args.batch_size, args.bptt, gpu=args.gpu,
        evaluation=True)
    del train_data, valid_data, test_data

print(' * vocabulary size. %d' % len(d))
print(' * number of train batches. %d' % len(train))

print('Building model...')
parser.add_argument('--max_seq_len', default=25, type=int)
parser.add_argument('--temperature', default=1, type=float)
parser.add_argument('--checkpoint', default=200, type=int)
parser.add_argument('--hooks_per_epoch', default=5, type=int)
parser.add_argument('--log_checkpoints', action='store_true')
parser.add_argument('--visdom_server', default='localhost')
parser.add_argument('--save', action='store_true')
args = parser.parse_args()

if args.load_data:
    train, test, d, table = u.load_model(args.data_path)
    lang_d, *conds_d = d
else:
    print("Fitting dictionaries")
    lang_d = Dict(max_size=args.max_size, min_freq=args.min_freq,
                  eos_token=u.EOS)
    conds_d = [Dict(sequential=False, force_unk=False) for _ in range(2)]
    linesiter = readlines(os.path.join(args.path, 'train.csv'))
    train_labels, train_lines = zip(*linesiter)
    print("Fitting language Dict")
    lang_d.fit(train_lines)
    print("Fitting condition Dicts")
    for d, cond in zip(conds_d, zip(*train_labels)):
        d.fit([cond])
    print("Processing datasets")
    print("Processing train")
    table = CompressionTable(len(conds_d))
    train = examples_from_lines(train_lines, train_labels,
size = args.train_len
batch_size = args.batch_size
sample_fn = getattr(d, args.sample_fn)

if args.path != '':
    with open(args.path, 'rb+') as f:
        dataset = PairedDataset.from_disk(f)
    dataset.set_batch_size(args.batch_size)
    dataset.set_gpu(args.gpu)
    train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
    src_dict = dataset.dicts['src']
else:
    src, trg = zip(*d.generate_set(
        size, vocab, args.min_len, args.max_len, sample_fn))
    src, trg = list(map(list, src)), list(map(list, trg))
    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
    src_dict.fit(src, trg)
    train, valid = PairedDataset(
        src, trg, {'src': src_dict, 'trg': src_dict},
        batch_size=args.batch_size, gpu=args.gpu
    ).splits(dev=args.dev, test=None, sort_by='src')

print(' * vocabulary size. %d' % len(src_dict))
print(' * number of train batches. %d' % len(train))
print(' * maximum batch size. %d' % batch_size)
# Paths
parser.add_argument('--path', required=True,
                    help='Path to a directory containing source and target '
                         'text files')
parser.add_argument('--pretrained', type=str, default='empty')
# Logging
parser.add_argument('--gen_src', default=None)
parser.add_argument('--gen_tgt', default=None)
parser.add_argument('--csv', type=str, default='empty')
parser.add_argument('--logging', action='store_true')
parser.add_argument('--visdom', action='store_true')
args = parser.parse_args()

src, trg = load_data(args.path, ('.answers', '.questions'))
src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                max_size=args.max_size, min_freq=args.min_freq)
src_dict.fit(src, trg)
train, valid = PairedDataset(
    src, trg, {'src': src_dict, 'trg': src_dict},
    batch_size=args.batch_size, gpu=args.gpu
).splits(dev=args.dev, test=None, sort_key=lambda pair: len(pair[0]))

print(' * vocabulary size. %d' % len(src_dict))
print(' * number of train batches. %d' % len(train))
print(' * maximum batch size. %d' % args.batch_size)

print('Building model...')
model = EncoderDecoder(
    (args.layers, args.layers),
    args.emb_dim,
    args.hid_dim,  # now a single int (was (args.hid_dim, args.hid_dim))
    args.att_dim,
    src_dict,
    att_type=args.att_type,
    dropout=args.dropout,
        words = self.d.vocab
        weight, words = EmbeddingLoader(filepath, mode).load(words, **kwargs)
        self.init_embeddings(weight, words)


if __name__ == '__main__':
    from seqmod.misc.dataset import Dict
    import inspect
    import collections

    text = []
    for _, func in inspect.getmembers(collections):
        doc = func.__doc__
        if doc is not None:
            text.extend([l.split() for l in doc.split('\n')])

    d = Dict().fit(text)
    emb = Embedding.from_dict(d, 100, p=0.2)
    filepath = 'test/data/glove.test1000.100d.txt'
    emb.init_embeddings_from_file(filepath, 'glove')
    weights, words = EmbeddingLoader(filepath, 'glove').load()
    weights = torch.Tensor(weights)
    by_word = dict(zip(words, weights))
    for weight, word in zip(emb.weight.data, emb.d.vocab):
        if word in by_word:
            assert torch.equal(weight, by_word[word])

    inp = Variable(torch.LongTensor(10).random_(emb.num_embeddings))
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('path')
parser.add_argument('--output_file', type=str, default="processed")
parser.add_argument('--min_freq', type=int, default=1)
parser.add_argument('--max_size', type=int, default=None)
parser.add_argument('--bos_token', type=str, default='<bos>')
parser.add_argument('--eos_token', type=str, default='<eos>')
parser.add_argument('--max_buffer_size', type=int, default=100000)
parser.add_argument('--num', action='store_true')
parser.add_argument('--lower', action='store_true')
parser.add_argument('--level', default='token')
args = parser.parse_args()

extractor = Dict(
    max_size=args.max_size, min_freq=args.min_freq,
    bos_token=args.bos_token, eos_token=args.eos_token)
processor = text_processor(num=args.num, lower=args.lower, level=args.level)

if os.path.isfile(args.path):
    files = [args.path]
else:
    files = [os.path.join(args.path, f) for f in os.listdir(args.path)]

start = time.time()
print("Fitting vocabulary")
for subset in process_files(files, processor, args.max_buffer_size):
    extractor.partial_fit(subset)
extractor.fit()
parser.add_argument('--save_path', default='models', type=str)
args = parser.parse_args()

# process data
if args.processed:
    print("Loading preprocessed datasets...")
    assert args.dict_path, "Processed data requires DICT_PATH"
    data, d = load_from_file(args.path), u.load_model(args.dict_path)
    train, test, valid = BlockDataset(
        data, d, args.batch_size, args.bptt, gpu=args.gpu, fitted=True
    ).splits(test=args.test_split, dev=args.dev_split)
    del data
else:
    print("Processing datasets...")
    proc = text_processor(lower=args.lower, num=args.num, level=args.level)
    d = Dict(max_size=args.max_size, min_freq=args.min_freq,
             eos_token=u.EOS, force_unk=True)
    train, valid, test = None, None, None
    # corpus is already split
    if os.path.isfile(os.path.join(args.path, 'train.txt')):
        if not os.path.isfile(os.path.join(args.path, 'test.txt')):
            raise ValueError("train.txt requires test.txt")
        train_data = load_lines(
            os.path.join(args.path, 'train.txt'), processor=proc)
        d.fit(train_data)
        train = BlockDataset(
            train_data, d, args.batch_size, args.bptt, gpu=args.gpu)
        del train_data
        test = BlockDataset(
            load_lines(os.path.join(args.path, 'test.txt'), proc),
            d, args.batch_size, args.bptt, gpu=args.gpu, evaluation=True)
        if os.path.isfile(os.path.join(args.path, 'valid.txt')):
    inps, lengths = [], []
    num_batches = len(corpus) // batch_size
    prev = 0
    for b in range(num_batches):
        to = min((b + 1) * batch_size, len(corpus) - 1)
        inp, length = pad_sequential_batch(
            corpus[prev:to], d.get_pad(), True, False)
        inps.append(torch.tensor(inp))
        lengths.append(length)
        prev = to
    return inps, lengths


char_text = [generate_sent() for _ in range(100)]
word_text = [s.split() for s in char_text]
char_d = Dict(pad_token=PAD).fit(char_text)
word_d = Dict(pad_token=PAD).fit(word_text)
char_inps, char_lengths = create_batches(10, char_text, char_d)
word_inps, word_lengths = create_batches(10, word_text, word_d)
n_char_inps = sum(sum(l) for l in char_lengths)
n_word_inps = sum(sum(l) for l in word_lengths)


def make_word_embedding_runner(embedding):
    def runner():
        for word_inp in word_inps:
            embedding(word_inp)
    return runner


def make_char_embedding_runner(embedding):
def shingle_dataset(args, vocab_dict=None, focus_size=None, right_size=None):
    if focus_size:
        args.focus_size = focus_size
    if right_size:
        args.right_size = right_size

    # load the data:
    if args.task == 'sentences':
        dataset = list(
            SentenceCouples(args.input, max_items=args.max_items,
                            tokenize=args.tokenize, level=args.level))
        print(f'* loaded {len(dataset)} sentences')
    elif args.task == 'snippets':
        dataset = list(
            SnippetCouples(args.input, focus_size=args.focus_size,
                           right_size=args.right_size,
                           max_items=args.max_items))
        print(f'* loaded {len(dataset)} snippets')
    else:
        raise ValueError("`task` should be one of ('sentences', 'snippets')")

    # random shuffle:
    if args.shuffle:
        print('* shuffling batches...')
        random.seed(args.rnd_seed)
        random.shuffle(dataset)

    for c in dataset[:10]:
        print('\t'.join(' '.join(s[:10]) for s in c))

    if vocab_dict is None:
        vocab_dict = Dict(pad_token=u.PAD, bos_token=u.BOS, eos_token=u.EOS,
                          min_freq=args.min_item_freq, sequential=True,
                          force_unk=True, max_size=args.max_vocab_size)

    focus, right = zip(*dataset)
    del dataset

    if not vocab_dict.fitted:
        # sometimes inefficient? do a partial fit in the triple store?
        vocab_dict.fit(focus, right)

    train, valid = PairedDataset(
        src=(focus,), trg=(right,),
        d={'src': (vocab_dict,), 'trg': (vocab_dict,)},
        batch_size=args.batch_size, gpu=args.gpu,
        align_right=args.reverse, fitted=False
    ).splits(sort_by='src', dev=args.dev, test=None, sort=True)

    return train, valid, vocab_dict
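# Hedged usage sketch (not from the original source): shingle_dataset expects
# an argparse-style namespace; every field value below is a placeholder.
if __name__ == '__main__':
    from argparse import Namespace
    args = Namespace(
        task='sentences', input='data/corpus.txt', max_items=None,
        tokenize=True, level='token', focus_size=1, right_size=1,
        shuffle=True, rnd_seed=1234, min_item_freq=1, max_vocab_size=50000,
        batch_size=32, gpu=False, reverse=False, dev=0.1)
    train, valid, vocab_dict = shingle_dataset(args)
    print(f'* {len(train)} train batches, vocab size {len(vocab_dict)}')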
parser.add_argument('--bptt', default=50, type=int)
parser.add_argument('--epochs', default=10, type=int)
parser.add_argument('--optim', default='Adam')
parser.add_argument('--lr', default=0.001, type=float)
parser.add_argument('--hooks_per_epoch', default=1, type=int)
parser.add_argument('--checkpoints', default=20, type=int)
parser.add_argument('--gpu', action='store_true')
args = parser.parse_args()

if args.load_dataset:
    dataset = BlockDataset.from_disk(args.dataset_path)
    dataset.set_batch_size(args.batch_size)
    dataset.set_gpu(args.gpu)
else:
    words, pos = zip(*load_penn3(args.path, swbd=False))
    word_dict = Dict(eos_token=u.EOS, bos_token=u.BOS, force_unk=True,
                     max_size=100000)
    pos_dict = Dict(eos_token=u.EOS, bos_token=u.BOS, force_unk=False)
    word_dict.fit(words)
    pos_dict.fit(pos)
    dataset = BlockDataset((pos, words), (pos_dict, word_dict),
                           args.batch_size, args.bptt)
    if args.save_dataset and not os.path.isfile(args.dataset_path):
        dataset.to_disk(args.dataset_path)

train, valid = dataset.splits(test=None)
pos_dict, word_dict = train.d

m = DoubleRNNPOSAwareLM(
    (len(pos_dict.vocab), len(word_dict.vocab)),  # vocabs
    (args.pos_emb_dim, args.word_emb_dim),
    (args.pos_hid_dim, args.word_hid_dim),