Example #1
import pickle
import sys
from nn.utils import Vocabulary

session = sys.argv[1]

vocab_file = open("session/%s/vocab.pkl" % session, "rb")
vocab = Vocabulary()

vocab = pickle.load(vocab_file)
vocab_file.close()

print(vocab.word_to_index)
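
The example above reads a pickled Vocabulary from session/<name>/vocab.pkl. The writer script is not part of these examples, so the following is only a minimal sketch of how such a file could be produced with the same Vocabulary class; the session name and the character range are assumptions.

import pickle
from nn.utils import Vocabulary

session = "example_session"  # hypothetical session name

vocab = Vocabulary()
vocab.add('<pad>')
vocab.add('<unk>')
vocab.add('<end>')
for i in range(32, 128):  # printable ASCII, as in the later examples
    vocab.add(chr(i))

# Assumes the session/<name>/ directory already exists.
with open("session/%s/vocab.pkl" % session, "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)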
Example #2
class TwitterReconstructionDatabase(object):
    def __init__(self,
                 phase,
                 batch_size,
                 max_len=140,
                 batches_per_epoch=1000,
                 pad=True):
        self.phase = phase
        self.batch_size = batch_size
        self.batches_per_epoch = batches_per_epoch
        self.max_len = max_len
        self.vocab = Vocabulary()
        self.vocab.add('<pad>')
        self.vocab.add('<unk>')
        self.vocab.add('<end>')
        for i in range(256):
            ch = chr(i)
            self.vocab.add(ch)
        self.n_classes = len(self.vocab)
        self.pad = pad

        self.tweets = []
        with open("data/tweets.txt") as f:
            while True:
                s = f.readline()
                if s == "":
                    break
                s = s.strip().split(" ")
                for i in range(len(s)):
                    if s[i].startswith('http://'):
                        s[i] = "url"
                    if s[i].startswith('https://'):
                        s[i] = "url"
                    if s[i].startswith("@"):
                        s[i] = "@userid"
                    #if s[i].startswith("#"):
                    #    s[i] = "#hashtag"

                s = ''.join([s1 + " " for s1 in s]).strip()
                tweet = s
                if len(tweet) <= max_len - 1:
                    self.tweets.append(tweet)
                if len(self.tweets) >= 1000000:
                    break

        valid_size = 10000
        if self.phase == 'train':
            self.tweets = self.tweets[valid_size:]
        else:
            self.tweets = self.tweets[:valid_size]

        print("%s: %d tweets, max %d chars" %
              (phase, len(self.tweets), max_len))

        x = self.make_batch()
        self.shared_x = theano.shared(x)

        self.index = T.iscalar()

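    # Encode a tweet as a fixed-length vector of vocabulary ids:
    # its characters, then <end>, then <pad> up to max_len.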
    def to_inputs(self, tweet):
        chars = [self.vocab.by_word(ch, oov_word='<unk>') for ch in tweet]
        chars.append(self.vocab.by_word('<end>'))
        for i in range(self.max_len - len(tweet) - 1):
            chars.append(self.vocab.by_word('<pad>'))
        return numpy.asarray(chars)

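    # Build one (max_len, batch_size) matrix of ids. With pad=True, tweets are
    # drawn at random (with replacement) and padded to max_len; with pad=False,
    # tweets within a few characters of a randomly chosen length are grouped
    # and the batch is trimmed to the longest one.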
    def make_batch(self):
        batch = numpy.zeros((self.max_len, self.batch_size))

        if self.pad:
            for i in range(self.batch_size):
                idx = numpy.random.randint(len(self.tweets))
                batch[:, i] = self.to_inputs(self.tweets[idx])
        else:
            idx = numpy.random.randint(len(self.tweets))
            max_len = len(self.tweets[idx])
            target_len = len(self.tweets[idx])
            batch[:, 0] = self.to_inputs(self.tweets[idx])
            i = 1
            while i < self.batch_size:
                idx = numpy.random.randint(len(self.tweets))
                if abs(len(self.tweets[idx]) - target_len) > 3:
                    continue
                batch[:, i] = self.to_inputs(self.tweets[idx])
                max_len = max(max_len, len(self.tweets[idx]) + 1)
                i += 1
            batch = batch[0:max_len]

        return batch.astype('int32')

    def total_batches(self):
        return self.batches_per_epoch

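    # Theano 'givens' substitution: feed the symbolic input x from a slice of
    # the shared batch selected by self.index; the target t is not used here.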
    def givens(self, x, t):
        return {
            x:
            self.shared_x[:, self.index * self.batch_size:(self.index + 1) *
                          self.batch_size],
        }

    def indices(self):
        for i in range(self.total_batches()):
            x = self.make_batch()
            self.shared_x.set_value(x)
            yield 0
Example #3
import os
import pickle
import time

from nn.layers import OneHot
from nn.utils import Vocabulary
import nn.utils

from lm_vae import Sampler
from lm_vae_sample import LNLSTMStep
from textproject_vae_charlevel import make_model

from wordfilter import Wordfilter
wordfilter = Wordfilter()

t1 = time.time()

session = "sp15_trial"

vocab = Vocabulary()

if os.path.exists("session/%s/vocab.pkl" % session):
    with open("session/%s/vocab.pkl" % session, "rb") as vocab_file:
        vocab = pickle.load(vocab_file)
        print("Loaded vocab with %i chars:" % len(vocab))
        #print(vocab.index_to_word)
else:
    print("Using default 256-char vocab")
    # old-school
    vocab.add("<pad>")
    vocab.add("<unk>")
    vocab.add("<end>")
    for i in xrange(256):
        ch = chr(i)
        vocab.add(ch)
Example #4
@app.route('/get_z', methods=['GET'])
def get_z():
    #json = request.get_json()
    s1 = request.args.get('s1')
    print(s1)
    z, text = serve_get_z(s1)
    return jsonify({"z": z, "text": text})

# The actual work

t1 = time.time()

print("It begins")

vocab = Vocabulary()

if os.path.exists("session/%s/vocab.pkl" % session):
    with open("session/%s/vocab.pkl" % session, "rb") as vocab_file:
        vocab = pickle.load(vocab_file)
        print("Loaded vocab with %i chars:" % len(vocab))
        print(vocab.index_to_word)
else:
    print("Using default 256-char vocab")
    # Should probably extract this into a little shared module with textproject_reconstruction_database
    # Maybe later; a sketch of such a helper follows this example.
    vocab.add('<pad>')
    vocab.add('<unk>')
    vocab.add('<end>')
    for i in xrange(32, 128):
        ch = chr(i)
        vocab.add(ch)
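
The comment in the else branch above suggests sharing the default-vocabulary setup with textproject_reconstruction_database. A minimal sketch of such a helper follows; the module and function names are assumptions, not part of the original code.

# hypothetical shared helper, e.g. nn/vocab_defaults.py
from nn.utils import Vocabulary

def default_char_vocab(lo=32, hi=128):
    # The three special tokens, followed by printable ASCII by default.
    vocab = Vocabulary()
    vocab.add('<pad>')
    vocab.add('<unk>')
    vocab.add('<end>')
    for i in range(lo, hi):
        vocab.add(chr(i))
    return vocab

# The scripts above could then reduce their setup to:
#     vocab = default_char_vocab()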
Example #5
class TextProjectReconstructionDatabase(object):
    def __init__(self,
                 dataset,
                 phase,
                 batch_size,
                 max_len=140,
                 pad=True,
                 sp_model=None):
        self.phase = phase
        self.batch_size = batch_size
        self.max_len = max_len
        self.sp_model = sp_model

        self.vocab = Vocabulary()

        self.using_sp = (self.sp_model is not None) and (len(self.sp_model) > 0)
        if self.using_sp:
            print("Using sentencepiece")
            import sentencepiece as spm  # https://github.com/google/sentencepiece

            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(self.sp_model)
            sp_model_size = self.sp.GetPieceSize()
            print("Loaded SP model with", sp_model_size, "tokens")
            self.vocab.add('<pad>')
            self.vocab.add('<unk>')
            self.vocab.add('<end>')

            for i in xrange(sp_model_size):
                self.vocab.add(self.sp.IdToPiece(i))

        else:
            print("Using default fixed vocab")
            self.vocab.add('<pad>')
            self.vocab.add('<unk>')
            self.vocab.add('<end>')
            for i in xrange(32, 128):
                ch = chr(i)
                self.vocab.add(ch)

        self.n_classes = len(self.vocab)
        self.pad = pad

        self.sentences = []

        # First, check full path; if that doesn't work, make a guess that it's in data/
        if os.path.exists(dataset):
            dataset_path = dataset
        elif os.path.exists("data/%s" % dataset):
            dataset_path = "data/%s" % dataset
        else:
            raise Exception("Can't find any dataset named %s!" % dataset)

        with open(dataset_path) as f:
            while True:
                s = f.readline()
                if s == "":
                    break
                if self.using_sp:
                    chars = self.sp.EncodeAsIds(s)
                    if len(chars) <= max_len - 1:
                        self.sentences.append(s)
                elif len(s) <= max_len - 1:
                    self.sentences.append(s)
                #if len(self.sentences) >= 1000000:
                #break

        self.shuffle_sentences()

        valid_size = int(len(self.sentences) * 0.1)

        if self.phase == 'train':
            self.sentences = self.sentences[valid_size:]
        else:
            self.sentences = self.sentences[:valid_size]

        print "%s data: %d sentences, max %d chars" % (
            phase, len(self.sentences), max_len)

        self.batches_per_epoch = int(len(self.sentences) / batch_size)

        # per the original textvae code, let's just keep this lean
        if self.phase == 'valid':
            print "Reducing valid set to 100 batches"
            self.batches_per_epoch = min(self.batches_per_epoch, 100)

        x = self.make_batch()

        self.shared_x = theano.shared(x)

        self.index = T.iscalar()

    def shuffle_sentences(self):
        # this all might be horribly inefficient but whatever
        t = time.time()
        print("Shuffling %s sentences..." % len(self.sentences))
        numpy.random.shuffle(self.sentences)
        print "...done. Took %s seconds" % round(time.time() - t)

    def to_inputs(self, sentence):
        sentence = sentence.replace("\n", "")
        if self.using_sp:
            #print(self.sp.EncodeAsPieces(sentence))
            chars = self.sp.EncodeAsIds(sentence)
        else:
            chars = [
                self.vocab.by_word(ch, oov_word='<unk>') for ch in sentence
            ]
        chars.append(self.vocab.by_word('<end>'))
        for i in xrange(self.max_len - len(chars)):
            chars.append(self.vocab.by_word('<pad>'))
        return numpy.asarray(chars)

    # The original code drew random samples but didn't keep track of which had already been drawn.
    # That seemed not ideal, so I rewrote it to make minibatches draw samples *without* replacement.
    # EDIT: Now back to doing it the original way, because speed! (A sketch of the
    # without-replacement variant follows make_batch below.)

    def make_batch(self):
        batch = numpy.zeros((self.max_len, self.batch_size))

        if self.pad:
            for i in xrange(self.batch_size):
                idx = numpy.random.randint(len(self.sentences))
                # sampling with replacement (see note above); the sentence stays in the list
                batch[:, i] = self.to_inputs(self.sentences[idx])
        else:
            idx = numpy.random.randint(len(self.sentences))
            max_len = len(self.sentences[idx])
            target_len = len(self.sentences[idx])
            batch[:, 0] = self.to_inputs(self.sentences[idx])
            i = 1
            while i < self.batch_size:
                idx = numpy.random.randint(len(self.sentences))
                if abs(len(self.sentences[idx]) - target_len) > 3:
                    continue
                batch[:, i] = self.to_inputs(self.sentences[idx])
                max_len = max(max_len, len(self.sentences[idx]) + 1)
                i += 1
            batch = batch[0:max_len]

        return batch.astype('int32')
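
    # Sketch of the without-replacement variant mentioned in the note above.
    # Hypothetical: not used elsewhere in this class, and it only mirrors the
    # pad=True path (every column is padded to max_len).
    def make_batch_without_replacement(self):
        if not hasattr(self, '_order'):
            self._order = numpy.random.permutation(len(self.sentences))
            self._cursor = 0
        batch = numpy.zeros((self.max_len, self.batch_size))
        for i in xrange(self.batch_size):
            if self._cursor >= len(self._order):
                # epoch exhausted: reshuffle and start over
                self._order = numpy.random.permutation(len(self.sentences))
                self._cursor = 0
            idx = self._order[self._cursor]
            self._cursor += 1
            batch[:, i] = self.to_inputs(self.sentences[idx])
        return batch.astype('int32')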

    def givens(self, x, t):
        return {
            x:
            self.shared_x[:, self.index * self.batch_size:(self.index + 1) *
                          self.batch_size],
        }

    def total_batches(self):
        return self.batches_per_epoch

    def indices(self):
        for i in xrange(self.batches_per_epoch):
            x = self.make_batch()
            self.shared_x.set_value(x)
            yield 0
Example #6
def main(z, sample_size, p, lstm_size, mode, alpha):
    vocab = Vocabulary()
    vocab.add('<pad>')
    vocab.add('<unk>')
    vocab.add('<end>')
    for i in range(256):
        ch = chr(i)
        vocab.add(ch)
    n_classes = len(vocab)

    model = make_model(z, sample_size, p, n_classes, lstm_size, alpha)
    name = "twittervae.charlevel.z_%d.len_%d.p_%.2f.lstmsz_%d.alpha_%.2f" % (
        z, sample_size, p, lstm_size, alpha)
    model.load("exp/%s/model.flt" % name)
    model.set_phase(train=False)

    start_word = n_classes

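    # Build the latent codes to decode, depending on mode:
    #   'vary'            - one random z, sweeping each dimension over normal quantiles
    #   'interpolatereal' - encode two real tweets and interpolate between their means
    #   'arithm'          - vector arithmetic z1 - z2 + z3 on three encoded tweets
    #   'interpolate'     - interpolate between two random z vectors
    #   otherwise         - 100 independent samples from the prior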
    if mode == 'vary':
        n = 7
        sampled = numpy.random.normal(0, 1, (1, z))
        sampled = numpy.repeat(sampled, n * z, axis=0)
        for dim in range(z):
            eps = 0.01
            x = numpy.linspace(eps, 1 - eps, num=n)
            x = norm.ppf(x)
            sampled[dim * n:(dim + 1) * n, dim] = x
        n *= z
    elif mode == 'interpolatereal':
        valid_db = TwitterReconstructionDatabase("valid",
                                                 50,
                                                 batches_per_epoch=100,
                                                 max_len=sample_size)
        s1 = numpy.random.randint(0, len(valid_db.tweets))
        s2 = numpy.random.randint(0, len(valid_db.tweets))
        encoder = model.layers[0].branches[0]
        sampler = encoder[-1]
        assert isinstance(sampler, Sampler)
        ins = numpy.zeros((sample_size, 2))
        ins[:, 0] = valid_db.to_inputs(valid_db.tweets[s1])
        ins[:, 1] = valid_db.to_inputs(valid_db.tweets[s2])
        x = T.imatrix()
        z = encoder(x)
        mu = sampler.mu
        f = theano.function([x], mu)
        z = f(ins.astype('int32'))
        s1_z = z[0]
        s2_z = z[1]
        n = 7
        s1_z = numpy.repeat(s1_z[None, :], n, axis=0)
        s2_z = numpy.repeat(s2_z[None, :], n, axis=0)
        steps = numpy.linspace(0, 1, n)[:, None]
        sampled = s1_z * (1 - steps) + s2_z * steps
    elif mode == 'arithm':
        valid_db = TwitterReconstructionDatabase("valid",
                                                 50,
                                                 batches_per_epoch=100,
                                                 max_len=sample_size)
        s1 = numpy.random.randint(0, len(valid_db.tweets))
        s2 = numpy.random.randint(0, len(valid_db.tweets))
        s3 = numpy.random.randint(0, len(valid_db.tweets))
        print(valid_db.tweets[s1])
        print(valid_db.tweets[s2])
        print(valid_db.tweets[s3])
        encoder = model.layers[0].branches[0]
        sampler = encoder[-1]
        assert isinstance(sampler, Sampler)
        ins = numpy.zeros((sample_size, 3))
        ins[:, 0] = valid_db.to_inputs(valid_db.tweets[s1])
        ins[:, 1] = valid_db.to_inputs(valid_db.tweets[s2])
        ins[:, 2] = valid_db.to_inputs(valid_db.tweets[s3])
        x = T.imatrix()
        z = encoder(x)
        mu = sampler.mu
        f = theano.function([x], mu)
        z = f(ins.astype('int32'))
        s1_z = z[0]
        s2_z = z[1]
        s3_z = z[2]
        n = 1
        sampled = s1_z - s2_z + s3_z
        sampled = sampled[None, :]
    elif mode == 'interpolate':
        z = numpy.random.normal(0, 1, (2, z))
        s1_z = z[0]
        s2_z = z[1]
        n = 7
        s1_z = numpy.repeat(s1_z[None, :], n, axis=0)
        s2_z = numpy.repeat(s2_z[None, :], n, axis=0)
        steps = numpy.linspace(0, 1, n)[:, None]
        sampled = s1_z * (1 - steps) + s2_z * steps
    else:
        n = 100
        sampled = numpy.random.normal(0, 1, (n, z))

    start_words = numpy.ones(n) * start_word
    start_words = theano.shared(start_words.astype('int32'))
    sampled = theano.shared(sampled.astype(theano.config.floatX))

    decoder_from_z = model.layers[1].branches[0]
    from_z = decoder_from_z(sampled)

    layers = model.layers[-3:]
    layers[0] = LNLSTMStep(layers[0])
    step = Sequential(layers)
    embed = model.layers[1].branches[1].layers[-1]

    words = start_words
    generated = []
    for i in range(sample_size):
        ins = T.concatenate([from_z[i], embed(words)], axis=1)
        pred = step(ins)
        words = T.argmax(pred, axis=1)
        generated.append(words[None, :])

    generated = T.concatenate(generated, axis=0)
    import time
    t = time.time()
    print("compiling...", end=' ')
    f = theano.function([], outputs=generated)
    print("done, took %f secs" % (time.time() - t))
    w = f()

    results = []

    pad = vocab.by_word("<pad>")
    end = vocab.by_word("<end>")
    for i in range(w.shape[1]):
        s = []
        for idx in w[:, i]:
            if idx == end:
                break
            if idx == pad:
                break
            s.append(vocab.by_index(idx))
        r = ''.join(s)
        if mode == "vary":
            if i % n == 0:
                print("dimension %d" % (i / n))
        print(r.strip())
        results.append(r)