Example 1
 def setUp(self):
     self.corpus = [lorem.sentence().split() for _ in range(100)]
     self.tagged_corpus = [[fake_tags(w) for w in s] for s in self.corpus]
     self.tag1_corpus = [[tup[1] for tup in s] for s in self.tagged_corpus]
     self.tag2_corpus = [[tup[2] for tup in s] for s in self.tagged_corpus]
     # dicts
     self.seq_d = Dict(eos_token=utils.EOS,
                       bos_token=utils.BOS,
                       force_unk=True,
                       sequential=True)
     self.seq_d.fit(self.corpus)
     self.tag1_d = Dict(eos_token=utils.EOS,
                        bos_token=utils.BOS,
                        force_unk=True,
                        sequential=True)
     self.tag1_d.fit(self.tag1_corpus)
     self.tag2_d = Dict(eos_token=utils.EOS,
                        bos_token=utils.BOS,
                        force_unk=True,
                        sequential=True)
     self.tag2_d.fit(self.tag2_corpus)
     # props
     self.batch_size = 10
     self.bptt = 5
     # datasets
     self.simple_dataset = BlockDataset(self.corpus, self.seq_d,
                                        self.batch_size, self.bptt)
     words, tags1, tags2 = [], [], []
     for s in self.tagged_corpus:
         words.append([tup[0] for tup in s])
         tags1.append([tup[1] for tup in s])
         tags2.append([tup[2] for tup in s])
     self.multi_dataset = BlockDataset(
         (words, tags1, tags2), (self.seq_d, self.tag1_d, self.tag2_d),
         self.batch_size, self.bptt)
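BlockDataset's batch_size/bptt arguments suggest standard language-model batching: flatten the corpus into one token stream, cut it into batch_size parallel columns, and step through the columns in bptt-sized windows. A dependency-free sketch of that layout (an assumption about BlockDataset's internals, not its actual code):

corpus = [['lorem', 'ipsum', 'dolor'], ['sit', 'amet']]  # toy stand-in
batch_size, bptt = 2, 1
stream = [w for s in corpus for w in s]        # flatten sentences
n = len(stream) // batch_size                  # tokens per column
cols = [stream[i * n:(i + 1) * n] for i in range(batch_size)]
# each batch pairs a bptt-wide input window with the next-token targets
batches = [([c[j:j + bptt] for c in cols],
            [c[j + 1:j + 1 + bptt] for c in cols])
           for j in range(0, n - 1, bptt)]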
Example 2
def load_twisty_dataset(src, trg, batch_size, max_size=100000, min_freq=5,
                        gpu=False, shuffle=True, **kwargs):
    """
    Wrapper for the twisty dataset with sensible, overridable defaults.
    """
    tweets_dict = Dict(pad_token=u.PAD, eos_token=u.EOS,
                       bos_token=u.BOS, max_size=max_size, min_freq=min_freq)
    labels_dict = Dict(sequential=False, force_unk=False)
    tweets_dict.fit(src)
    labels_dict.fit(trg)
    d = {'src': tweets_dict, 'trg': labels_dict}
    splits = PairedDataset(src, trg, d, batch_size, gpu=gpu).splits(
        shuffle=shuffle, **kwargs)
    return splits
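A hypothetical call, assuming tokenized tweets with one label apiece, and assuming the extra keyword arguments are the dev/test fractions that splits takes in Example 8:

tweets = [['just', 'landed'], ['coffee', 'first'], ['good', 'night']]  # toy data
labels = ['female', 'male', 'female']                                  # made-up labels
splits = load_twisty_dataset(tweets, labels, batch_size=2, dev=0.1, test=0.2)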
Example 3
 def setUp(self):
     self.corpus = [lorem.sentence().split() for _ in range(1000)]
     self.path = '/tmp/lorem.test.txt'
     with open(self.path, 'w') as f:
         for s in self.corpus:
             f.write(' '.join(s) + '\n')
     self.d = Dict(force_unk=True, sequential=True).fit(self.corpus)
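The file written here holds one whitespace-tokenized sentence per line, so a plain round trip recovers the corpus; this is also the format that load_lines consumes in Examples 7 and 9:

with open('/tmp/lorem.test.txt') as f:
    corpus = [line.split() for line in f]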
Example 4
 def setUp(self):
     self.sents = []
     for _ in range(5000):
         sent = lorem.sentence().split()
         if sent not in self.sents:
             # avoid duplicates since `test_pairing` relies on sorting
             self.sents.append(sent)
     props = [0.1, 0.4, 0.3, 0.2]
     self.labels = np.random.multinomial(1, props,
                                         (len(self.sents))).argmax(1)
     d = Dict(pad_token='<PAD>').fit(self.sents)
     ld = Dict(sequential=False).fit(self.labels)
     self.dataset = PairedDataset(self.sents,
                                  self.labels, {
                                      'src': d,
                                      'trg': ld
                                  },
                                  batch_size=10)
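Inside a test, the resulting dataset could then be partitioned the way Example 8 does; a minimal sketch, assuming the same splits signature applies to any PairedDataset:

train, valid = self.dataset.splits(dev=0.1, test=None, sort=True)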
Example 5
 def setUp(self):
     self.corpus = [lorem.sentence().split() for _ in range(100)]
     self.seq_vocab = Counter(w for s in self.corpus for w in s)
     self.seq_d = Dict(eos_token=utils.EOS,
                       bos_token=utils.BOS,
                       force_unk=True,
                       sequential=True)
     self.seq_d.fit(self.corpus)
     self.seq_transformed = list(self.seq_d.transform(self.corpus))
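transform evidently returns a generator (hence the list() wrapper) of index sequences. The fit/transform contract boils down to building a token-to-id vocabulary and mapping each sentence through it; a dependency-free sketch of that idea, not seqmod's actual implementation:

corpus = [['lorem', 'ipsum'], ['dolor', 'ipsum']]
vocab = {w: i for i, w in enumerate(sorted({w for s in corpus for w in s}))}
encoded = [[vocab[w] for w in s] for s in corpus]  # e.g. [[2, 1], [0, 1]]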
Example 6
 def setUp(self):
     self.corpus = [lorem.sentence().split() for _ in range(100)]
     self.tagged_corpus = [[fake_tags(w) for w in s] for s in self.corpus]
     self.tag1_corpus = [[tup[1] for tup in s] for s in self.tagged_corpus]
     self.tag2_corpus = [[tup[2] for tup in s] for s in self.tagged_corpus]
     self.seq_d = Dict(eos_token=utils.EOS,
                       bos_token=utils.BOS,
                       force_unk=True,
                       sequential=True)
     self.seq_d.fit(self.corpus)
     self.tag1_d = Dict(eos_token=utils.EOS,
                        bos_token=utils.BOS,
                        force_unk=True,
                        sequential=True)
     self.tag1_d.fit(self.tag1_corpus)
     self.tag2_d = Dict(eos_token=utils.EOS,
                        bos_token=utils.BOS,
                        force_unk=True,
                        sequential=True)
     self.tag2_d.fit(self.tag2_corpus)
Example 7
def load_split_data(path, batch_size, max_size, min_freq, max_len, gpu,
                    processor):
    """
    Load a corpus already split into 'train.txt', 'valid.txt' and 'test.txt'.
    """
    train_data = load_lines(os.path.join(path, 'train.txt'),
                            max_len=max_len,
                            processor=processor)
    valid_data = load_lines(os.path.join(path, 'valid.txt'),
                            max_len=max_len,
                            processor=processor)
    test_data = load_lines(os.path.join(path, 'test.txt'),
                           max_len=max_len,
                           processor=processor)

    d = Dict(pad_token=u.PAD,
             eos_token=u.EOS,
             bos_token=u.BOS,
             max_size=max_size,
             min_freq=min_freq,
             force_unk=True)
    d.fit(train_data, valid_data)

    train = PairedDataset(train_data, None, {'src': d}, batch_size, gpu=gpu)
    valid = PairedDataset(valid_data,
                          None, {'src': d},
                          batch_size,
                          gpu=gpu,
                          evaluation=True)
    test = PairedDataset(test_data,
                         None, {'src': d},
                         batch_size,
                         gpu=gpu,
                         evaluation=True)

    return train.sort_(), valid.sort_(), test.sort_()
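A hypothetical invocation, assuming a directory containing the three split files and reusing the text_processor factory from Example 9 (level='word' is a guess, not a documented value):

processor = text_processor(lower=True, num=True, level='word')
train, valid, test = load_split_data('data/corpus', batch_size=20,
                                     max_size=50000, min_freq=5,
                                     max_len=100, gpu=False,
                                     processor=processor)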
Example 8
    batch_size = args.batch_size
    sample_fn = getattr(d, args.sample_fn)

    if args.path is not None:
        with open(args.path, 'rb+') as f:
            dataset = PairedDataset.from_disk(f)
        dataset.set_batch_size(args.batch_size)
        dataset.set_device(args.device)
        train, valid = dataset.splits(sort_by='src', dev=args.dev, test=None)
        src_dict = dataset.dicts['src']
    else:
        str_generator = d.generate_set(size, vocab, args.min_len, args.max_len,
                                       sample_fn)
        src, trg = zip(*str_generator)
        src, trg = list(map(list, src)), list(map(list, trg))
        src_dict = Dict(pad_token=u.PAD, eos_token=u.EOS, bos_token=u.BOS)
        src_dict.fit(src, trg)
        trg_dict = src_dict
        if args.reverse:
            trg_dict = copy.deepcopy(src_dict)
            trg_dict.align_right = True
        train, valid = PairedDataset(
            src,
            trg,
            {
                'src': src_dict,
                'trg': trg_dict
            },
            batch_size=args.batch_size,
            device=args.device,
        ).splits(dev=args.dev, test=None, sort=True)
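The deep copy plus align_right on the reversed target dict presumably switches padding to the left, so reversed sequences stay aligned at their final timestep. A dependency-free illustration of the two padding layouts (an assumption about what align_right controls):

sent, pad, length = ['a', 'b', 'c'], '<pad>', 5
left_padded  = [pad] * (length - len(sent)) + sent  # align_right=True
right_padded = sent + [pad] * (length - len(sent))  # default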
Example 9
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', required=True)
    parser.add_argument('--output', help='prefix for the stored dataset', required=True)
    parser.add_argument('--max_size', type=int, default=100000)
    parser.add_argument('--min_freq', default=1, type=int)
    parser.add_argument('--lower', action='store_true')
    parser.add_argument('--num', action='store_true')
    parser.add_argument('--level', default='char')
    args = parser.parse_args()

    processor = text_processor(
        lower=args.lower, num=args.num, level=args.level)
    d = Dict(max_size=args.max_size, min_freq=args.min_freq,
             eos_token=u.EOS, force_unk=True)

    trainpath = os.path.join(args.path, 'train.txt')
    testpath = os.path.join(args.path, 'test.txt')
    outputformat = (args.output + ".{}.npz").format

    if os.path.isfile(outputformat("train")):
        raise ValueError("Output train file already exists")
    if os.path.isfile(outputformat("test")):
        raise ValueError("Output test file already exists")

    print("Fitting dictionary")
    d.fit(load_lines(trainpath, processor=processor),
          load_lines(testpath, processor=processor))
    u.save_model(d, args.output + '.dict')
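Run from the shell, the script expects train.txt and test.txt under --path, refuses to overwrite existing <output>.{train,test}.npz files, and stores the fitted dictionary as <output>.dict. A hypothetical invocation (the script name is made up):

python preprocess.py --path data/corpus --output data/corpus.proc \
    --level char --lower --min_freq 5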
Example 10
    parser.add_argument('--patience', default=2, type=int)
    parser.add_argument('--epochs', type=int, default=25)
    parser.add_argument('--batch_size', type=int, default=50)
    parser.add_argument('--device', default='cpu')
    parser.add_argument('--checkpoint', type=int, default=100)
    parser.add_argument('--hook', type=int, default=1)
    parser.add_argument('--test', action='store_true')
    args = parser.parse_args()

    print("Loading data...")
    train_conds, train = zip(
        *load_sents(args.basedir, 'train', max_lines=args.max_lines))
    train_conds, train = list(train_conds), list(train)
    d = Dict(eos_token=u.EOS,
             bos_token=u.BOS,
             unk_token=u.UNK,
             pad_token=u.PAD,
             max_size=args.max_size,
             force_unk=True).fit(train)
    d2 = copy.deepcopy(d)
    d2.align_right = args.reverse
    conds_d = Dict(sequential=False).fit(train_conds)

    conditional = args.cond_emb > 0

    # AE+GRL+C
    if args.grl and conditional:
        src, trg = (train, train_conds), (train, train_conds)
        dicts = {'src': (d, conds_d), 'trg': (d2, conds_d)}
    # AE+GRL
    elif args.grl:
        src, trg = (train, train_conds), train
Example 11
        grl_loss = []
        for cond, grl in zip(conds, self.grls):
            cond_out = F.log_softmax(grad_reverse(grl(out)), 1)
            grl_loss.append(F.nll_loss(cond_out, cond, size_average=True))

        if not test:
            (sum(grl_loss) / len(self.grls)).backward(retain_graph=True)

        return [l.data[0] for l in grl_loss]

    GRLEncoder = type(
        'GRL{}'.format(EncoderBaseClass.__name__), (EncoderBaseClass, ), {
            '__init__': __init__,
            'loss': loss,
            'conditional': property(lambda self: True)
        })

    return GRLEncoder


GRLRNNEncoder = GRLWrapper(RNNEncoder)
GRLCNNEncoder = GRLWrapper(CNNEncoder)

if __name__ == '__main__':
    import os
    from seqmod.misc import Dict
    from seqmod.modules.embedding import Embedding
    text = open(os.path.realpath(__file__)).read().split()
    emb = Embedding.from_dict(Dict().fit(text), 100)
    GRLRNNEncoder([10], [10], emb, 10, 1, 'LSTM', summary='mean')
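grad_reverse is a gradient reversal layer in the style of Ganin & Lempitsky's domain-adversarial training: identity on the forward pass, negated gradients on the backward pass, which pushes the encoder to strip the condition out of its representation. A minimal sketch against the modern torch.autograd.Function API (the snippet above targets the older API, as the l.data[0] idiom shows), not seqmod's own implementation:

import torch


class GradReverse(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return x.view_as(x)       # identity going forward

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.neg()  # flip gradients going back


def grad_reverse(x):
    return GradReverse.apply(x)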
Example 12
    parser.add_argument('--device', default='cpu')
    parser.add_argument('--checkpoint', default=1000, type=int)
    parser.add_argument('--hook', default=1, type=int)
    parser.add_argument('--test', action='store_true', help="Don't save")
    args = parser.parse_args()

    print("Loading data...")
    src, src_conds, trg, trg_conds = \
        zip(*load_pairs(args.basedir, 'train', tt=args.tt, max_lines=args.max_lines))
    src, src_conds = list(src), list(src_conds)
    trg, trg_conds = list(trg), list(trg_conds)

    d = Dict(
        eos_token=u.EOS,
        bos_token=u.BOS,
        unk_token=u.UNK,
        pad_token=u.PAD,
        max_size=args.max_size,
        force_unk=True,
    ).fit(src, trg)
    d2 = copy.deepcopy(d)
    d2.align_right = args.reverse
    conds_d = Dict(sequential=False).fit(src_conds, trg_conds)

    # S2S+GRL
    if args.grl:
        if args.tt:
            raise ValueError("GRL+TT doesn't quite make sense")
        src, trg = (src, src_conds), trg
        dicts = {'src': (d, conds_d), 'trg': d2}
    # S2S or TT
    else: