Example #1
def load_dataset(src,
                 trg,
                 batch_size,
                 max_size=100000,
                 min_freq=5,
                 gpu=False,
                 shuffle=True,
                 sort_key=default_sort_key,
                 **kwargs):
    """
    Wrapper function for dataset loading with sensible, overridable defaults
    """
    tweets_dict = Dict(pad_token='<pad>',
                       eos_token='<eos>',
                       bos_token='<bos>',
                       max_size=max_size,
                       min_freq=min_freq)
    labels_dict = Dict(sequential=False, force_unk=False)
    tweets_dict.fit(src)
    labels_dict.fit(trg)
    d = {'src': tweets_dict, 'trg': labels_dict}
    splits = PairedDataset(src, trg, d, batch_size,
                           gpu=gpu).splits(shuffle=shuffle,
                                           sort_key=sort_key,
                                           **kwargs)
    return splits
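A minimal usage sketch for the loader above. The toy tweets/labels and the behaviour of splits() with no extra kwargs are assumptions, not part of the example:

# Hypothetical input: pre-tokenized tweets with one label each.
src = [['good', 'morning', 'world'], ['so', 'tired', 'today'], ['what', 'a', 'day']]
trg = ['positive', 'negative', 'neutral']

# The number and order of returned splits depend on the dev/test kwargs
# forwarded to splits(); here we keep whatever the defaults produce.
splits = load_dataset(src, trg, batch_size=2, min_freq=1)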
Example #2
def load_from_lines(
        path, batch_size, max_size=1000000, min_freq=5, gpu=False,
        shuffle=True, sort_key=lambda x: len(x[0]), **kwargs):
    lines = load_lines(path)
    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq)
    ldict.fit(lines)
    mock_labels = make_mock_labels(lines)
    mock = Dict()
    mock.fit(mock_labels)
    d = {'src': ldict, 'trg': mock}
    splits = PairedDataset(lines, mock_labels, d, batch_size, gpu=gpu).splits(
        shuffle=shuffle, sort_key=sort_key, **kwargs)
    return splits
Example #3
def load_penn(path,
              batch_size,
              max_size=1000000,
              min_freq=1,
              gpu=False,
              shuffle=True):
    train_data = load_lines(os.path.join(path, 'train.txt'))
    valid_data = load_lines(os.path.join(path, 'valid.txt'))
    test_data = load_lines(os.path.join(path, 'test.txt'))

    d = Dict(pad_token=u.PAD,
             eos_token=u.EOS,
             bos_token=u.BOS,
             max_size=max_size,
             min_freq=min_freq)
    d.fit(train_data, valid_data)

    train = PairedDataset(train_data, None, {'src': d}, batch_size, gpu=gpu)
    valid = PairedDataset(valid_data,
                          None, {'src': d},
                          batch_size,
                          gpu=gpu,
                          evaluation=True)
    test = PairedDataset(test_data,
                         None, {'src': d},
                         batch_size,
                         gpu=gpu,
                         evaluation=True)

    return train.sort_(), valid.sort_(), test.sort_()
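A hedged sketch of calling load_penn; the corpus path is made up, and using len() on a split to count batches follows the usage in Examples #10 and #12:

# Assumes a PTB-style directory containing train.txt / valid.txt / test.txt.
train, valid, test = load_penn('data/penn', batch_size=20)
print(' * number of train batches. %d' % len(train))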
Example #4
def setUp(self):
    text = [lorem.sentence() for _ in range(100)]
    self.d = Dict(pad_token=u.PAD).fit(text)
    self.corpus = list(self.d.transform(text))
    self.num_words = sum(len(s.split()) for s in text)
    self.word_lengths = [len(s.split()) for s in text]
    self.max_word_len = max(len(w) for s in text for w in s.split())
    self.max_seq_words = max(len(s.split()) for s in text)
Example #5
def viz_encoder_decoder(**kwargs):
    from modules.encoder_decoder import EncoderDecoder
    num_layers, emb_dim, hid_dim, att_dim = 1, 12, 16, 16
    d = Dict(pad_token='<pad>').fit(['a'])
    m = EncoderDecoder(num_layers, emb_dim, hid_dim, att_dim, d, **kwargs)
    src, trg = torch.LongTensor([[0, 1]]), torch.LongTensor([[0, 1]])
    out = m(Variable(src), Variable(trg))
    return make_dot(out)
Example #6
def viz_vae(**kwargs):
    from vae import SequenceVAE
    d = Dict(pad_token='<pad>').fit(['a'])
    num_layers, emb_dim, hid_dim, z_dim = 1, 12, 16, 16
    m = SequenceVAE(num_layers, emb_dim, hid_dim, z_dim, d)
    src = Variable(torch.LongTensor([[0, 1]]))
    logs, mu, logvar = m(src, src)
    z = m.encoder.reparametrize(mu, logvar)
    return make_dot(logs), make_dot(mu), make_dot(logvar), make_dot(z)
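Both visualization helpers return the output of make_dot; a rendering sketch under the assumption that make_dot yields a graphviz.Digraph (as in the usual PyTorch graph-visualization utilities):

dot = viz_encoder_decoder()
dot.render('encoder_decoder', format='pdf')  # writes encoder_decoder.pdf

logs_dot, mu_dot, logvar_dot, z_dot = viz_vae()
logs_dot.render('vae_log_probs', format='pdf')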
Example #7
def load_penn(path, batch_size,
              max_size=1000000, min_freq=1, gpu=False, shuffle=True,
              sort_key=lambda pair: len(pair[0])):
    train_data = load_lines(os.path.join(path, 'train.txt'))
    train_labels = make_mock_labels(train_data)
    valid_data = load_lines(os.path.join(path, 'valid.txt'))
    valid_labels = make_mock_labels(valid_data)
    test_data = load_lines(os.path.join(path, 'test.txt'))
    test_labels = make_mock_labels(test_data)
    ldict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                 max_size=max_size, min_freq=min_freq)
    ldict.fit(train_data, valid_data)
    mock = Dict().fit(train_labels)
    d = {'src': ldict, 'trg': mock}
    train = PairedDataset(train_data, train_labels, d, batch_size,
                          gpu=gpu).sort_(sort_key=sort_key)
    valid = PairedDataset(valid_data, valid_labels, d, batch_size, gpu=gpu,
                          evaluation=True).sort_(sort_key=sort_key)
    test = PairedDataset(test_data, test_labels, d, batch_size, gpu=gpu,
                         evaluation=True).sort_(sort_key=sort_key)
    return train, valid, test
Example #8
def load_from_lines(path,
                    batch_size,
                    max_size=1000000,
                    min_freq=5,
                    gpu=False,
                    shuffle=True,
                    **kwargs):
    lines = load_lines(path)

    ldict = Dict(pad_token=u.PAD,
                 eos_token=u.EOS,
                 bos_token=u.BOS,
                 max_size=max_size,
                 min_freq=min_freq).fit(lines)

    return PairedDataset(lines, None, {
        'src': ldict
    }, batch_size, gpu=gpu).splits(shuffle=shuffle, **kwargs)
Example #9
def load_split_data(path, batch_size, max_size, min_freq, max_len, device, processor):
    """
    Load a corpus that is already split into 'train.txt', 'valid.txt', 'test.txt'
    """
    train = load_lines(os.path.join(path, 'train.txt'), max_len, processor)
    valid = load_lines(os.path.join(path, 'valid.txt'), max_len, processor)

    d = Dict(
        pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
        max_size=max_size, min_freq=min_freq, force_unk=True
    ).fit(train, valid)

    # reload the splits (load_lines may return a one-shot iterator consumed by d.fit)
    train = load_lines(os.path.join(path, 'train.txt'), max_len, processor)
    valid = load_lines(os.path.join(path, 'valid.txt'), max_len, processor)
    test = load_lines(os.path.join(path, 'test.txt'), max_len, processor)
    train = PairedDataset(train, None, {'src': d}, batch_size, device=device)
    valid = PairedDataset(valid, None, {'src': d}, batch_size, device=device)
    test = PairedDataset(test, None, {'src': d}, batch_size, device=device)

    return train.sort_(), valid.sort_(), test.sort_()
Example #10
    if args.processed:
        print("Loading preprocessed datasets...")
        assert args.dict_path, "Processed data requires DICT_PATH"
        data, d = load_from_file(args.path), u.load_model(args.dict_path)
        train, test, valid = BlockDataset(
            data, d, args.batch_size, args.bptt, gpu=args.gpu, fitted=True
        ).splits(test=0.1, dev=0.1)
        del data
    else:
        print("Processing datasets...")
        proc = text_processor(
            lower=args.lower, num=args.num, level=args.level)
        train_data = load_lines(args.path + 'train.txt', processor=proc)
        valid_data = load_lines(args.path + 'valid.txt', processor=proc)
        test_data = load_lines(args.path + 'test.txt', processor=proc)
        d = Dict(max_size=args.max_size, min_freq=args.min_freq, eos_token=u.EOS)
        d.fit(train_data, valid_data)
        train = BlockDataset(
            train_data, d, args.batch_size, args.bptt, gpu=args.gpu)
        valid = BlockDataset(
            valid_data, d, args.batch_size, args.bptt, gpu=args.gpu,
            evaluation=True)
        test = BlockDataset(
            test_data, d, args.batch_size, args.bptt, gpu=args.gpu,
            evaluation=True)
        del train_data, valid_data, test_data

    print(' * vocabulary size. %d' % len(d))
    print(' * number of train batches. %d' % len(train))

    print('Building model...')
Example #11
    parser.add_argument('--max_seq_len', default=25, type=int)
    parser.add_argument('--temperature', default=1, type=float)
    parser.add_argument('--checkpoint', default=200, type=int)
    parser.add_argument('--hooks_per_epoch', default=5, type=int)
    parser.add_argument('--log_checkpoints', action='store_true')
    parser.add_argument('--visdom_server', default='localhost')
    parser.add_argument('--save', action='store_true')
    args = parser.parse_args()

    if args.load_data:
        train, test, d, table = u.load_model(args.data_path)
        lang_d, *conds_d = d
    else:
        print("Fitting dictionaries")
        lang_d = Dict(max_size=args.max_size,
                      min_freq=args.min_freq,
                      eos_token=u.EOS)
        conds_d = [Dict(sequential=False, force_unk=False) for _ in range(2)]
        linesiter = readlines(os.path.join(args.path, 'train.csv'))
        train_labels, train_lines = zip(*linesiter)
        print("Fitting language Dict")
        lang_d.fit(train_lines)
        print("Fitting condition Dicts")
        for d, cond in zip(conds_d, zip(*train_labels)):
            d.fit([cond])

        print("Processing datasets")
        print("Processing train")
        table = CompressionTable(len(conds_d))
        train = examples_from_lines(train_lines,
                                    train_labels,
Example #12
    size = args.train_len
    batch_size = args.batch_size
    sample_fn = getattr(d, args.sample_fn)

    if args.path != '':
        with open(args.path, 'rb+') as f:
            dataset = PairedDataset.from_disk(f)
        dataset.set_batch_size(args.batch_size)
        dataset.set_gpu(args.gpu)
        train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
        src_dict = dataset.dicts['src']
    else:
        src, trg = zip(*d.generate_set(size, vocab, args.min_len, args.max_len,
                                       sample_fn))
        src, trg = list(map(list, src)), list(map(list, trg))
        src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
        src_dict.fit(src, trg)
        train, valid = PairedDataset(src,
                                     trg, {
                                         'src': src_dict,
                                         'trg': src_dict
                                     },
                                     batch_size=args.batch_size,
                                     gpu=args.gpu).splits(dev=args.dev,
                                                          test=None,
                                                          sort_by='src')

    print(' * vocabulary size. %d' % len(src_dict))
    print(' * number of train batches. %d' % len(train))
    print(' * maximum batch size. %d' % batch_size)
Example #13
    # Paths
    parser.add_argument('--path', required=True,
                        help='Path to a directory '
                        'containing source and target text files')
    parser.add_argument('--pretrained', type=str, default='empty')

    # Logging
    parser.add_argument('--gen_src', default=None)
    parser.add_argument('--gen_tgt', default=None)
    parser.add_argument('--csv', type=str, default='empty')
    parser.add_argument('--logging', action='store_true')
    parser.add_argument('--visdom', action='store_true')
    args = parser.parse_args()

    src, trg = load_data(args.path, ('.answers', '.questions'))
    src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS,
                    max_size=args.max_size, min_freq=args.min_freq)
    src_dict.fit(src, trg)
    train, valid = PairedDataset(
        src, trg, {'src': src_dict, 'trg': src_dict},
        batch_size=args.batch_size, gpu=args.gpu
    ).splits(dev=args.dev, test=None, sort_key=lambda pair: len(pair[0]))

    print(' * vocabulary size. %d' % len(src_dict))
    print(' * number of train batches. %d' % len(train))
    print(' * maximum batch size. %d' % args.batch_size)

    print('Building model...')
    model = EncoderDecoder(
        # hid_dim is now passed as a single value rather than (args.hid_dim, args.hid_dim)
        (args.layers, args.layers), args.emb_dim, args.hid_dim,
        args.att_dim, src_dict, att_type=args.att_type, dropout=args.dropout,
Example #14
        words = self.d.vocab
        weight, words = EmbeddingLoader(filepath, mode).load(words, **kwargs)
        self.init_embeddings(weight, words)


if __name__ == '__main__':
    from seqmod.misc.dataset import Dict
    import inspect
    import collections

    text = []
    for _, func in inspect.getmembers(collections):
        doc = func.__doc__
        if doc is not None:
            text.extend([l.split() for l in doc.split('\n')])

    d = Dict().fit(text)
    emb = Embedding.from_dict(d, 100, p=0.2)
    filepath = 'test/data/glove.test1000.100d.txt'
    emb.init_embeddings_from_file(filepath, 'glove')

    weights, words = EmbeddingLoader(filepath, 'glove').load()
    weights = torch.Tensor(weights)
    by_word = dict(zip(words, weights))

    for weight, word in zip(emb.weight.data, emb.d.vocab):
        if word in by_word:
            assert torch.equal(weight, by_word[word])

    inp = Variable(torch.LongTensor(10).random_(emb.num_embeddings))
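The snippet ends just before the embedding is applied; a hedged continuation, assuming Embedding behaves like torch.nn.Embedding with a 100-dimensional table:

out = emb(inp)
print(out.size())  # expected (10, 100): ten random indices, 100-dim vectors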
Example #15
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('path')
    parser.add_argument('--output_file', type=str, default="processed")
    parser.add_argument('--min_freq', type=int, default=1)
    parser.add_argument('--max_size', type=int, default=None)
    parser.add_argument('--bos_token', type=str, default='<bos>')
    parser.add_argument('--eos_token', type=str, default='<eos>')
    parser.add_argument('--max_buffer_size', type=int, default=100000)
    parser.add_argument('--num', action='store_true')
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--level', default='token')
    args = parser.parse_args()

    extractor = Dict(
        max_size=args.max_size, min_freq=args.min_freq,
        bos_token=args.bos_token, eos_token=args.eos_token)

    processor = text_processor(
        num=args.num, lower=args.lower, level=args.level)

    if os.path.isfile(args.path):
        files = [args.path]
    else:
        files = [os.path.join(args.path, f) for f in os.listdir(args.path)]

    start = time.time()
    print("Fitting vocabulary")
    for subset in process_files(files, processor, args.max_buffer_size):
        extractor.partial_fit(subset)
    extractor.fit()
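Once fitted, the extractor can index new text; a hedged follow-up using the same transform method seen in Example #4 (the sample tokens are made up, and transform is assumed to accept an iterable of token sequences):

sample = [['the', 'quick', 'brown', 'fox']]
indexed = list(extractor.transform(sample))
print(indexed[0])  # integer indices, plus bos/eos if the Dict adds them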
Example #16
    parser.add_argument('--save_path', default='models', type=str)
    args = parser.parse_args()

    # process data
    if args.processed:
        print("Loading preprocessed datasets...")
        assert args.dict_path, "Processed data requires DICT_PATH"
        data, d = load_from_file(args.path), u.load_model(args.dict_path)
        train, test, valid = BlockDataset(
            data, d, args.batch_size, args.bptt, gpu=args.gpu, fitted=True
        ).splits(test=args.test_split, dev=args.dev_split)
        del data
    else:
        print("Processing datasets...")
        proc = text_processor(lower=args.lower, num=args.num, level=args.level)
        d = Dict(max_size=args.max_size, min_freq=args.min_freq,
                 eos_token=u.EOS, force_unk=True)
        train, valid, test = None, None, None
        # already split
        if os.path.isfile(os.path.join(args.path, 'train.txt')):
            if not os.path.isfile(os.path.join(args.path, 'test.txt')):
                raise ValueError("train.txt requires test.txt")
            train_data = load_lines(
                os.path.join(args.path, 'train.txt'), processor=proc)
            d.fit(train_data)
            train = BlockDataset(
                train_data, d, args.batch_size, args.bptt, gpu=args.gpu)
            del train_data
            test = BlockDataset(
                load_lines(os.path.join(args.path, 'test.txt'), proc),
                d, args.batch_size, args.bptt, gpu=args.gpu, evaluation=True)
            if os.path.isfile(os.path.join(args.path, 'valid.txt')):
Example #17
        inps, lengths = [], []
        num_batches = len(corpus) // batch_size
        prev = 0
        for b in range(num_batches):
            to = min((b + 1) * batch_size, len(corpus) - 1)
            inp, length = pad_sequential_batch(corpus[prev:to], d.get_pad(),
                                               True, False)
            inps.append(torch.tensor(inp))
            lengths.append(length)
            prev = to

        return inps, lengths

    char_text = [generate_sent() for _ in range(100)]
    word_text = [s.split() for s in char_text]
    char_d = Dict(pad_token=PAD).fit(char_text)
    word_d = Dict(pad_token=PAD).fit(word_text)

    char_inps, char_lengths = create_batches(10, char_text, char_d)
    word_inps, word_lengths = create_batches(10, word_text, word_d)
    n_char_inps = sum(sum(l) for l in char_lengths)
    n_word_inps = sum(sum(l) for l in word_lengths)

    def make_word_embedding_runner(embedding):
        def runner():
            for word_inp in word_inps:
                embedding(word_inp)

        return runner

    def make_char_embedding_runner(embedding):
Example #18
def shingle_dataset(args, vocab_dict=None, focus_size=None, right_size=None):
    if focus_size:
        args.focus_size = focus_size
    if right_size:
        args.right_size = right_size

    # load the data:
    if args.task == 'sentences':
        dataset = list(
            SentenceCouples(args.input,
                            max_items=args.max_items,
                            tokenize=args.tokenize,
                            level=args.level))
        print(f'* loaded {len(dataset)} sentences')
    elif args.task == 'snippets':
        dataset = list(
            SnippetCouples(args.input,
                           focus_size=args.focus_size,
                           right_size=args.right_size,
                           max_items=args.max_items))
        print(f'* loaded {len(dataset)} snippets')
    else:
        raise ValueError("`Task` should be one of ('sentences', 'snippets')")

    # random shuffle:
    if args.shuffle:
        print('* shuffling batches...')
        random.seed(args.rnd_seed)
        random.shuffle(dataset)

    for c in dataset[:10]:
        print('\t'.join(' '.join(s[:10]) for s in c))

    if vocab_dict is None:
        vocab_dict = Dict(pad_token=u.PAD,
                          bos_token=u.BOS,
                          eos_token=u.EOS,
                          min_freq=args.min_item_freq,
                          sequential=True,
                          force_unk=True,
                          max_size=args.max_vocab_size)

    focus, right = zip(*dataset)
    del dataset
    if not vocab_dict.fitted:
        vocab_dict.fit(
            focus, right
        )  # sometimes inefficient? # do a partial fit in the triple store?

    train, valid = PairedDataset(src=(focus, ),
                                 trg=(right, ),
                                 d={
                                     'src': (vocab_dict, ),
                                     'trg': (vocab_dict, )
                                 },
                                 batch_size=args.batch_size,
                                 gpu=args.gpu,
                                 align_right=args.reverse,
                                 fitted=False).splits(sort_by='src',
                                                      dev=args.dev,
                                                      test=None,
                                                      sort=True)

    return train, valid, vocab_dict
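Because shingle_dataset both accepts and returns a vocab_dict, a hedged sketch of reusing one vocabulary across two snippet configurations (args is assumed to carry task='snippets' and the other fields used above):

train_a, valid_a, vocab = shingle_dataset(args, focus_size=3, right_size=3)
train_b, valid_b, _ = shingle_dataset(args, vocab_dict=vocab,
                                      focus_size=5, right_size=5)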
Example #19
    parser.add_argument('--bptt', default=50, type=int)
    parser.add_argument('--epochs', default=10, type=int)
    parser.add_argument('--optim', default='Adam')
    parser.add_argument('--lr', default=0.001, type=float)
    parser.add_argument('--hooks_per_epoch', default=1, type=int)
    parser.add_argument('--checkpoints', default=20, type=int)
    parser.add_argument('--gpu', action='store_true')
    args = parser.parse_args()

    if args.load_dataset:
        dataset = BlockDataset.from_disk(args.dataset_path)
        dataset.set_batch_size(args.batch_size)
        dataset.set_gpu(args.gpu)
    else:
        words, pos = zip(*load_penn3(args.path, swbd=False))
        word_dict = Dict(eos_token=u.EOS,
                         bos_token=u.BOS,
                         force_unk=True,
                         max_size=100000)
        pos_dict = Dict(eos_token=u.EOS, bos_token=u.BOS, force_unk=False)
        word_dict.fit(words)
        pos_dict.fit(pos)
        dataset = BlockDataset((pos, words), (pos_dict, word_dict),
                               args.batch_size, args.bptt)
        if args.save_dataset and not os.path.isfile(args.dataset_path):
            dataset.to_disk(args.dataset_path)
    train, valid = dataset.splits(test=None)

    pos_dict, word_dict = train.d

    m = DoubleRNNPOSAwareLM(
        (len(pos_dict.vocab), len(word_dict.vocab)),  # vocabs
        (args.pos_emb_dim, args.word_emb_dim),
        (args.pos_hid_dim, args.word_hid_dim),