Example #1
class LM_With_RNN:
    def __init__(self, texts):

        # Create the training data
        xy = self.preprocess_text(texts)
        self.X_train = xy['x']
        self.y_train = xy['y']

        self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)

        self.train_with_sgd()
def train(X_train,y_train,vocabulary_size,hiddenDim,modelFiles):
	model = RNNTheano(vocabulary_size, hidden_dim=hiddenDim)
	t1 = time.time()
	model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
	t2 = time.time()
	print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

	if modelFiles != None:
	    load_model_parameters_theano(modelFiles, model)

	train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)
def train_model(x_train, y_train, training_data_name="training_data_name", load_model_file="", num_epochs=50, learning_rate=0.010, hidden_dim=100, vocab_size=8000):
    model = RNNTheano(vocab_size, hidden_dim=hidden_dim)
    t1 = time.time()
    model.sgd_step(x_train[10], y_train[10], learning_rate)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

    if load_model_file != "":
        print "loading model: %s" % load_model_file
        load_model_parameters_theano(load_model_file, model)

    train_with_sgd(model, x_train, y_train, nepoch=num_epochs, learning_rate=learning_rate, training_data_name=training_data_name)
Example #4
def import_model(model_location, csv_file=_CSV_FILE):
    # Recover the hidden dimension and vocabulary size from the numbers
    # embedded in the model file name (e.g. rnn-theano-<hidden>-<vocab>-...)
    m = re.findall(r'\d+', model_location)
    vocab_size = int(m[1])
    hidden_d = int(m[0])
    print "Vocab size = %d, hidden dimensions = %d" % (vocab_size, hidden_d)
    model2 = RNNTheano(vocab_size, hidden_dim=hidden_d)
    load_model_parameters_theano(model_location, model2)
    _, idx_to_word, word_to_idx = load_data_set(csv_file, vocab_size)
    return model2, idx_to_word, word_to_idx
def generate_examples(model_name, index_to_word, word_to_index, vocab_size=8000, hidden_dim=100, num_sentences=10, sentences_min_length=4):

    model = RNNTheano(vocab_size, hidden_dim)
    load_model_parameters_theano(model_name, model)
    sentences = []

    for i in range(num_sentences):
        sent = []
        while len(sent) < sentences_min_length:
            sent = generate_sentence(model, index_to_word, word_to_index)
        print " ".join(sent)

    while len(sentences) < num_sentences:
        sent = generate_sentence(model, index_to_word, word_to_index)
        if len(sent) >= sentences_min_length:
            sentences.append(sent)

    return sentences
def train_theano():
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], Config._LEARNING_RATE)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

    model.train_with_sgd(X_train, y_train, nepoch=Config._NEPOCH, learning_rate=Config._LEARNING_RATE)

    if Config._MODEL_FILE != None:
        print "start saving model..."
        save_model_parameters_theano(Config._MODEL_FILE, model)
        print "model saved!"
Example #7
# creating vocabulary
if not os.path.isfile(vocab_file):
    vocab = construct_vocabulary(train_file)
    write_vocabulary(vocab,vocab_file)

# read the vocab
index_to_word, word_to_index = read_vocabulary(vocab_file, 8000)
# adding special symbols
index_to_word.append(sentence_end_token)
index_to_word.append(sentence_start_token)
word_to_index[sentence_start_token] = VOCAB_SIZE+1
word_to_index[sentence_end_token] = VOCAB_SIZE+2

if THEANO:
    rnn = RNNTheano(VOCAB_SIZE+SPEC_SYMBOLS_COUNT, hidden_dim = 50)
else:
    rnn = RNN(VOCAB_SIZE+SPEC_SYMBOLS_COUNT, VOCAB_SIZE+SPEC_SYMBOLS_COUNT,hidden_dim = 100)
# generate sentences
print("training the model")
loss = [rnn.total_loss(itertools.islice(tokenize_file(word_to_index, train_file), MAX_L_SENTENCES))]
for e in range(EPOCHS):
    i = 0
    print("--- Epoch "+str(e+1)+" ---")
    loss.append(rnn.total_loss(itertools.islice(tokenize_file(word_to_index, train_file), MAX_L_SENTENCES)))
    sentences = tokenize_file(word_to_index, train_file)
    for sentence in itertools.islice(sentences, MAX_SENTENCES):
        i+=1
        sentence.insert(0,word_to_index[sentence_start_token])
        y = copy.copy(sentence)
        y.pop(0)
Example #8
def train_numpy(self, x_train, y_train, iterations):
    self.model = RNNNumpy(word_dim=self.vocabulary_size,
                          hidden_dim=100, bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
Example #9
np.save('data/ixtoword.npy', index_to_word)

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokens])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokens])
feat_path = './data/feats.npy'
feats = np.load(feat_path)
dim_embed = 256
dim_hidden = 256
dim_image = 4096
feats = feats[:158900]

encode_img_W = np.random.uniform(-0.1, 0.1, (dim_image, dim_hidden))
encode_img_b = np.zeros(dim_hidden)

# Project the image features into the hidden space
bv = np.dot(feats, encode_img_W) + encode_img_b

model = RNNTheano(len(index_to_word), hidden_dim=dim_hidden)
t1 = time.time()
model.sgd_step(X_train[10], bv[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE != None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model, X_train, bv, y_train, nepoch=1000, learning_rate=0.01)
Example #10
# Replace all words not in our vocabulary with the unknown token
# todo needs cleaner text preprocessing
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        w if w in word_to_index else unknown_token for w in sent
    ]

# Create the training data
X_train = numpy.asarray([[word_to_index[w] for w in sent[:-1]]
                         for sent in tokenized_sentences])
y_train = numpy.asarray([[word_to_index[w] for w in sent[1:]]
                         for sent in tokenized_sentences])

######################################################################## construct RNN
print "constructing model..."  # todo try a smarter initialization - wrt vanishing gradients
model = RNNTheano(vocabulary_size, hidden_dim=HIDDEN_DIM)

gradient_check_theano(model,
                      X_train[10],
                      y_train[10],
                      h=0.0000001,
                      error_threshold=0.01)

######################################################################## train

if RETRAIN:
    # run a single step to get a feel for training time
    print "run a single step..."
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], LEARNING_RATE)
    t2 = time.time()
Example #11
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        w if w in word_to_index else unknown_token for w in sent
    ]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after pre-processing: '%s'" % tokenized_sentences[0])

# Create the training data
# Note that the length of each sentence is different
# X_train - every word of each sentence except for the last one
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokenized_sentences])
# y_train - every word except for the first one
y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                      for sent in tokenized_sentences])

model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./trained-model-theano.npz', model)


def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])


model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
# t1 = time.time()
# model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
# t2 = time.time()
# print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE != None:
    load_model_parameters_theano(_MODEL_FILE, model)

# train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)


o = model.forward_propagation(X_train[1])
print o
print [index_to_word[x] for x in X_train[1]]
p = 1
Example #13
                time, num_examples_seen, epoch, accuracy)

        # For each training example (SGD step)...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1


# Create the training data
x, y = cPickle.load(open('OnlyNPs_codeonly.cPickle', 'rb'))
X_train = np.array(x, dtype='float32')
y_train = np.array(y, dtype='float32')

# Specify the model (the timing of a single SGD step is commented out below)
model = RNNTheano(10, 333, hidden_dim=30)
#t1 = time.time()
#model.sgd_step(X_train[10], y_train[10], 0.005)
#t2 = time.time()
#print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

# Train the model for n epochs
train_with_sgd(model,
               X_train,
               y_train,
               nepoch=150,
               learning_rate=0.01,
               evaluate_loss_after=1)

# Save model parameters
save_model_parameters_theano('model_parameters_OnlyNPs', model)
Example #14
# creating vocabulary
if not os.path.isfile(vocab_file):
    vocab = construct_vocabulary(train_file)
    write_vocabulary(vocab, vocab_file)

# read the vocab
index_to_word, word_to_index = read_vocabulary(vocab_file, 8000)
# adding special symbols
index_to_word.append(sentence_end_token)
index_to_word.append(sentence_start_token)
word_to_index[sentence_start_token] = VOCAB_SIZE + 1
word_to_index[sentence_end_token] = VOCAB_SIZE + 2

if THEANO:
    rnn = RNNTheano(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=50)
else:
    rnn = RNN(VOCAB_SIZE + SPEC_SYMBOLS_COUNT,
              VOCAB_SIZE + SPEC_SYMBOLS_COUNT,
              hidden_dim=100)
# generate sentences
print("training the model")
loss = [
    rnn.total_loss(
        itertools.islice(tokenize_file(word_to_index, train_file),
                         MAX_L_SENTENCES))
]
for e in range(EPOCHS):
    i = 0
    print("--- Epoch " + str(e + 1) + " ---")
    loss.append(
Example #15
import preprocess
from rnn_numpy import RNNNumpy
from rnn_theano import RNNTheano
import numpy as np
import cProfile

X_train, y_train, vocabulary_size = preprocess.create_train_data()
np.random.seed(10)
model = RNNNumpy(vocabulary_size)

np.random.seed(10)
model = RNNNumpy(vocabulary_size)
#cProfile.run("model.numpy_sdg_step(X_train[10], y_train[10], 0.005)")
#print("----------------------------------------------------------------")
np.random.seed(10)
model_theano = RNNTheano(vocabulary_size)
#cProfile.run("model_theano.train_with_sgd(X_train[10], y_train[10], 0.005)")
print("----------------------------------------------------------------")
losses_numpy = model.train_with_sgd(X_train[:100],
                                    y_train[:100],
                                    nepoch=5,
                                    evaluate_loss_after=1)
losses_theano = model_theano.train_with_sgd(X_train[:100],
                                            y_train[:100],
                                            nepoch=5,
                                            evaluate_loss_after=1)
num_sentences = 10
senten_min_length = 7

for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print " ".join(sent)

## Evaluating on Theano with CPU

import os
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=cpu,floatX=float32"

import theano
import theano.tensor as T
from utils import *

from rnn_theano import RNNTheano, gradient_check_theano

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

## Without Theano in fast mode it took 1m 12s. Python alone took 750 ms
np.random.seed(20)
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)
Example #17
    print "Expected Loss for random predictions: %f" % np.log(model.word_dim)
    print "Actual loss: %f" % model.calculate_loss(X_train[:100],
                                                   y_train[:100])


def test_performance(model, learning_rate):
    print "\ntest performance: " + str(type(model))
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], learning_rate)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)


model_gru = GRUTheano(word_dim=_VOCABULARY_SIZE,
                      hidden_dim=_HIDDEN_DIM,
                      bptt_truncate=-1)
model_theano = RNNTheano(word_dim=_VOCABULARY_SIZE,
                         hidden_dim=_HIDDEN_DIM,
                         bptt_truncate=-1)
model_rnn = RNNNumpy(word_dim=_VOCABULARY_SIZE,
                     hidden_dim=_HIDDEN_DIM,
                     bptt_truncate=-1)

test_performance(model_gru, _LEARNING_RATE)
test_performance(model_theano, _LEARNING_RATE)
test_performance(model_rnn, _LEARNING_RATE)

test_loss(model_gru)
test_loss(model_theano)
test_loss(model_rnn)
Example #18
for i, sent in enumerate(tokenized_sentences):
  tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
 
print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]


# Create the training data
X_train = np.asarray([[np.int32(word_to_index[w]) for w in sent[:-1]] for sent in tokenized_sentences])
Y_train = np.asarray([[np.int32(word_to_index[w]) for w in sent[1:]] for sent in tokenized_sentences])

print X_train, type(X_train)
print Y_train, type(Y_train)

np.random.seed(10)
model = RNNTheano(vocabulary_size)
# model = RNNNumpy(vocabulary_size)
# o, s = model.forward_propagation(X_train[1])
# print o.shape
# print o
l = [8,9,0,1,2,3,4,5,6,7]
x = np.asarray([np.int32(a) for a in l])
l2 = [3,4,5,9,0,1]
x2 = np.asarray([np.int32(a) for a in l2])
# x = np.asarray([np.int32(a) for a in range(0,10)])
# print x, type(x)
print "input", x, x2
# x[0] = 10
# print x, type(x)
# o = model.forward_propagation(x)
# print "o.shape",(o).shape, o 
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

# Print a training data example
x_example, y_example = X_train[17], y_train[17]
print ("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print ("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

#Training our Network with Theano and the GPU

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

np.random.seed(10)
model = RNNTheano(vocabulary_size)
model.sgd_step(X_train[10], y_train[10], 0.005)


# Run the model
model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./data/trained-model-theano.npz', model)

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
Example #20

# ### Training our Network with Theano and the GPU
#
# I have previously written a [tutorial](http://www.wildml.com/2015/09/speeding-up-your-neural-network-with-theano-and-the-gpu/) on Theano, and since all our logic will stay exactly the same I won't go through the optimized code here again. I defined a `RNNTheano` class that replaces the numpy calculations with corresponding calculations in Theano. Just like the rest of this post, [the code is also available on Github](https://github.com/dennybritz/rnn-tutorial-rnnlm).

# In[20]:

from rnn_theano import RNNTheano, gradient_check_theano
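
# The gist of that replacement, as a minimal standalone sketch (my own
# illustration under the usual word_dim/hidden_dim assumptions, not the
# tutorial's exact class): the recurrence is written symbolically with
# theano.scan and compiled once with theano.function, which is what removes
# the per-step numpy loops.
import numpy as np
import theano
import theano.tensor as T

word_dim, hidden_dim = 8000, 100
# Shared parameter matrices, initialized like the numpy version
U = theano.shared(np.random.uniform(-0.01, 0.01, (hidden_dim, word_dim)).astype(theano.config.floatX), name='U')
W = theano.shared(np.random.uniform(-0.01, 0.01, (hidden_dim, hidden_dim)).astype(theano.config.floatX), name='W')
V = theano.shared(np.random.uniform(-0.01, 0.01, (word_dim, hidden_dim)).astype(theano.config.floatX), name='V')

x = T.ivector('x')  # a sentence as a sequence of word indices

def forward_step(x_t, s_t_prev):
    # s_t = tanh(U[:, x_t] + W . s_{t-1}),  o_t = softmax(V . s_t)
    s_t = T.tanh(U[:, x_t] + W.dot(s_t_prev))
    o_t = T.nnet.softmax(V.dot(s_t))
    return [o_t[0], s_t]

[o, s], _ = theano.scan(forward_step,
                        sequences=x,
                        outputs_info=[None, T.zeros(hidden_dim)])

# Compile once; every later call runs the whole optimized (optionally GPU) graph
forward_propagation_sketch = theano.function([x], o)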

# In[ ]:

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0, 1, 2, 3], [1, 2, 3, 4])

# In[ ]:

np.random.seed(10)
model = RNNTheano(vocabulary_size)
# get_ipython().magic(u'timeit model.sgd_step(X_train[10], y_train[10], 0.005)')

# This time, one SGD step takes 70ms on my Mac (without GPU) and 23ms on a [g2.2xlarge](https://aws.amazon.com/ec2/instance-types/#g2) Amazon EC2 instance with GPU. That's a 15x improvement over our initial implementation and means we can train our model in hours/days instead of weeks. There are still a vast number of optimizations we could make, but we're good enough for now.
#
# To help you avoid spending days training a model I have pre-trained a Theano model with a hidden layer dimensionality of 50 and a vocabulary size of 8000. I trained it for 50 epochs in about 20 hours. The loss was still decreasing and training longer would probably have resulted in a better model, but I was running out of time and wanted to publish this post. Feel free to try it out yourself and train for longer. You can find the model parameters in `data/trained-model-theano.npz` in the Github repository and load them using the `load_model_parameters_theano` method:

# In[ ]:

from utils import load_model_parameters_theano, save_model_parameters_theano
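
# A minimal sketch of loading those pre-trained parameters (assuming the
# repository's RNNTheano class and the bundled data/trained-model-theano.npz):
model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./data/trained-model-theano.npz', model)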
Example #21
'''
np.random.seed(10) #FLAG
# Train on a small subset of the data to see what happens
model = RNNumpy(vocabsize)
losses = trainwithsgd(model, Xtrain[:100], ytrain[:100], nepoch=10, evaluate_loss_after=1)



np.random.seed(10) #FLAG
model = RNNTheano(vocabsize)
model.sgd_step(Xtrain[10], ytrain[10], 0.005)
'''

from utils import load_model_parameters_theano, save_model_parameters_theano

model = RNNTheano(vocabsize, hidden_dim=50)
# losses = train_with_sgd(model, X_train, y_train, nepoch=50)
# save_model_parameters_theano('./data/trained-model-theano.npz', model)
load_model_parameters_theano('/home/ihasdapie/Documents/AI/Data/trained-model-theano.npz', model)
def generate_sentence(model):
	# We start the sentence with the start token
	new_sentence = [wordtoindex[starttoken]]
	# Repeat until we get an end token
	while not new_sentence[-1] == wordtoindex[endtoken]:
		next_word_probs = model.forward_propagation(new_sentence)
		sampled_word = wordtoindex[unknowntoken]
		# We don't want to sample unknown words
		while sampled_word == wordtoindex[unknowntoken]:
			samples = np.random.multinomial(1, next_word_probs[-1])
			sampled_word = np.argmax(samples)
		new_sentence.append(sampled_word)
Example #22
class RNNLM:
    def __init__(self):
        self.unknown_token = "UNKNOWN_TOKEN"
        self.sentence_start_token = "SENTENCE_START"
        self.sentence_end_token = "SENTENCE_END"
        self.index_to_word = None
        self.word_to_index = None
        self.model = None

    def tokenize_data(self, n=-1):
        # download dependent nltk resources if you haven't.
        # nltk.download('punkt')

        # Read the data and append SENTENCE_START and SENTENCE_END tokens
        print "Reading sentences from gutenberg corpus ..."
        from nltk.corpus import gutenberg
        tokenized_sentences = []
        for s in gutenberg.sents('austen-emma.txt'):
            tokenized_sentences.append([self.sentence_start_token] + s[1:-1] +
                                       [self.sentence_end_token])
        print "Parsed %d sentences." % (len(tokenized_sentences))

        if n > 0:
            tokenized_sentences = tokenized_sentences[:n]

        # count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print "Found %d unique words tokens." % len(word_freq.items())

        self.vocabulary_size = int(len(word_freq.items()) * 0.95)

        # get the most common words, treat others words as unknown.
        vocab = word_freq.most_common(self.vocabulary_size - 1)
        print "Using vocabulary size %d." % self.vocabulary_size
        print "The least frequent word is '%s' and appeared %d times." % \
              (vocab[-1][0], vocab[-1][1])
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict([(w, i)
                                   for i, w in enumerate(self.index_to_word)])

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [
                w if w in self.word_to_index else self.unknown_token
                for w in sent
            ]

        # create training data
        x_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]]
                              for sent in tokenized_sentences])
        y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]]
                              for sent in tokenized_sentences])

        print ""
        print "Example sentence: '%s'" % tokenized_sentences[0]
        print "By word indexes: '%s'" % \
              [self.word_to_index[w] for w in tokenized_sentences[0]]

        return (x_train, y_train)

    def train_numpy(self, x_train, y_train, iterations):
        self.model = RNNNumpy(word_dim=self.vocabulary_size,
                              hidden_dim=100,
                              bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_theano(self, x_train, y_train, iterations):
        self.model = RNNTheano(word_dim=self.vocabulary_size,
                               hidden_dim=100,
                               bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_lstm_theano(self, x_train, y_train, iterations):
        self.model = RNNTheano(word_dim=self.vocabulary_size,
                               hidden_dim=100,
                               bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def generate_sentence(self):
        # repeat until we get an end token
        sentence_start_idx = self.word_to_index[self.sentence_start_token]
        sentence_end_idx = self.word_to_index[self.sentence_end_token]
        unknown_word_idx = self.word_to_index[self.unknown_token]
        # start the sentence with the start token
        new_sentence = [sentence_start_idx]
        while new_sentence[-1] != sentence_end_idx:
            next_word_probs = self.model.forward_propagation(new_sentence)
            sampled_word = unknown_word_idx
            # skip unknown words
            while sampled_word == unknown_word_idx or \
                  sampled_word == sentence_start_idx:
                samples = np.random.multinomial(1, next_word_probs[0])
                sampled_word = np.argmax(samples)
            new_sentence.append(sampled_word)
        return new_sentence

    def generate_sentences(self, num_sentences, min_length):
        for i in xrange(num_sentences):
            sent = []
            # We want long sentences, not sentences with one or two words
            while len(sent) < min_length:
                sent = self.generate_sentence()
                sent_str = [self.index_to_word[x] for x in sent[1:-1]]
            print " ".join(sent_str).encode('utf-8')
            print ""
Example #23
def train_lstm_theano(self, x_train, y_train, iterations):
    self.model = RNNTheano(word_dim=self.vocabulary_size,
                           hidden_dim=100, bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
Example #24
def load_trained_model():
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)
    print 'start loading...'
    load_model_parameters_theano(Config._MODEL_FILE, model)
    print 'load over!'
    return model
Example #25
def train_numpy(self, x_train, y_train, iterations):
    self.model = RNNNumpy(word_dim=self.vocabulary_size,
                          hidden_dim=100,
                          bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size-1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
'''
# Create the training data
#X_train = np.asarray([[ord(char)] for char in chars])
#y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
X_train = np.asarray(sentences_tokens_x)
y_train = np.asarray(sentences_tokens_y)

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
t1 = time.time()

model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE != None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)
Example #27
def train_lstm_theano(self, x_train, y_train, iterations):
    self.model = RNNTheano(word_dim=self.vocabulary_size,
                           hidden_dim=100,
                           bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
Example #28
    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    #print 'Found %d unique words tokens.' % len(word_freq.items())

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(_VOCABULARY_SIZE-1)
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown_token)
    word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

    with open(_PICKLE_IDX_WRD_FILE, 'wb') as pkl_file:
        pickle.dump(index_to_word, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(_PICKLE_WRD_IDX_FILE, 'wb') as pkl_file:
        pickle.dump(word_to_index, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)

model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)
load_model_parameters_theano(_MODEL_FILE, model)

def clean_up_sentence(sent):
    clean_dict = {' ,': ',',
                  ' !': '!',
                  ' :': ':',
                  ' ?': '?',
                  ' .': '.',
                  ' \'': '\'',
                  ' --':'--'}

    for bad, good in clean_dict.iteritems():
        sent = sent.replace(bad, good)

    pms = clean_dict.values()
Example #29
ALPHA = 0.015
EPOCHS = 5
HIDDEN_LAYER_SIZE = 50
PRELOAD_WEIGHTS = False

# read the vocab
index_to_word, word_to_index = load_vocab(vocab_file)
VOCAB_SIZE = len(index_to_word)
# adding special symbols
index_to_word[VOCAB_SIZE] = sentence_start_token
index_to_word[VOCAB_SIZE+1] = sentence_end_token
word_to_index[sentence_start_token] = VOCAB_SIZE
word_to_index[sentence_end_token] = VOCAB_SIZE+1


rnn = RNNTheano(VOCAB_SIZE+SPEC_SYMBOLS_COUNT, hidden_dim = HIDDEN_LAYER_SIZE)
if PRELOAD_WEIGHTS:
    print "preloading weights"
    rnn.preload_weights(weights_file)
    train_loss = []
    test_loss = []
else:
    print "training the model"
    train_loss = []
    test_loss = []
    for e in range(EPOCHS):
        i = 0
        print("--- Epoch "+str(e+1)+" ---")
        train_loss.append(rnn.total_loss(itertools.islice(load_songs(train_file,word_to_index),MAX_L_SENTENCES)))
        test_loss.append(rnn.total_loss(itertools.islice(load_songs(test_file,word_to_index),MAX_L_SENTENCES)))
        sentences = load_songs(train_file,word_to_index)

def generate_sentence(model, index_to_word, word_to_index, min_length=5):
    # We start the sentence with the start token
    new_sentence = [word_to_index[SENTENCE_START_TOKEN]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[SENTENCE_END_TOKEN]:
        next_word_probs = model.predict(new_sentence)[-1]
        samples = np.random.multinomial(1, next_word_probs)
        sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
        # Sometimes we get stuck if the sentence becomes too long, e.g. "........" :(
        # And: We don't want sentences with UNKNOWN_TOKEN's
        if len(new_sentence) > 100 or sampled_word == word_to_index[UNKNOWN_TOKEN]:
            return None
    if len(new_sentence) < min_length:
        return None
    return new_sentence


# cProfile.run("model.numpy_sdg_step(X_train[10], y_train[10], 0.005)")
# print("----------------------------------------------------------------")
np.random.seed(10)
model_theano = RNNTheano(vocabulary_size)
# cProfile.run("model_theano.train_with_sgd(X_train[10], y_train[10], 0.005)")
print ("----------------------------------------------------------------")
# losses_numpy = model.train_with_sgd(X_train[:100], y_train[:100], nepoch=5, evaluate_loss_after=1)
losses_theano = model_theano.train_with_sgd(X_train[:100], y_train[:100], nepoch=5, evaluate_loss_after=1)
generated_sentence = generate_sentence(model, index_to_word, word_to_index)
print generated_sentence
Example #31
class LM_With_RNN:
    def __init__(self, texts):

        # Create the training data
        xy = self.preprocess_text(texts)
        self.X_train = xy['x']
        self.y_train = xy['y']

        self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)

        self.train_with_sgd()

    def train_with_sgd(self, nepoch=_NEPOCH, evaluate_loss_after=5, learning_rate=_LEARNING_RATE):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if (epoch % evaluate_loss_after == 0):
                loss = self.model.calculate_loss(self.X_train, self.y_train)
                losses.append((num_examples_seen, loss))
                # Adjust the learning rate if loss increases
                if (len(losses) > 1 and losses[-1][1] > losses[-2][1]):
                    learning_rate = learning_rate * 0.5
            # For each training example...
            for i in range(len(self.y_train)):
                # One SGD step
                self.model.sgd_step(self.X_train[i], self.y_train[i], learning_rate)
                num_examples_seen += 1
        return self.model

    def calculate_score(self, text):
        texts = [text]
        xy = self.preprocess_text(texts)
        X_train = xy['x']
        y_train = xy['y']
        o = self.model.forward_propagation(X_train[0])
        # Sum the negative log-probabilities assigned to the target (next) words
        p = 0
        for i, w in enumerate(y_train[0]):
            p += -1 * np.log10(o[i][w])
        return p

    def preprocess_text(self, texts, vocabulary_size=_VOCABULARY_SIZE):
        unknown_token = "UNKNOWN_TOKEN"
        sentence_start_token = "SENTENCE_START"
        sentence_end_token = "SENTENCE_END"

        # Split full comments into sentences
        # sentences = itertools.chain(*[nltk.sent_tokenize(x.decode('utf-8').lower()) for x in texts])
        sentences = itertools.chain(*[nltk.sent_tokenize(x.lower()) for x in texts])
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]

        # Tokenize the sentences into words
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]

        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

        # Get the most common words and build index_to_word and word_to_index vectors
        if vocabulary_size == -1:
            # keep the full vocabulary; most_common() with no argument
            # returns all (word, count) pairs, matching the branch below
            vocab = word_freq.most_common()
        else:
            vocab = word_freq.most_common(vocabulary_size - 1)
        index_to_word = [x[0] for x in vocab]
        index_to_word.append(unknown_token)
        word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

        # Create the training data
        X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

        return {
            'x': X_train,
            'y': y_train,
            'index_to_word': index_to_word,
            'word_to_index': word_to_index
        }
Example #32
_LEARNING_RATE = float(os.environ.get('LEARNING_RATE', '0.005'))
_NEPOCH = int(os.environ.get('NEPOCH', '100'))
_MODEL_FILE = os.environ.get('MODEL_FILE')

vocabulary_size = _VOCABULARY_SIZE
unknown_token = "UNKNOWN_TOKEN"
sentence_start_token = "SENTENCE_START"
sentence_end_token = "SENTENCE_END"

print "Reading the vocabulary..."
with open('data/vocab', 'r') as vfile:
    index_to_word, word_to_index = pickle.load(vfile)

print "Using vocabulary size %d." % vocabulary_size

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
load_model_parameters_theano(
    './data/rnn-theano-80-8000-2017-07-14-01-15-51.npz', model)


def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 100
np.random.seed(10)
model = RNNNumpy(grad_check_vocab_size, 10, bptt_truncate=1000)
model.gradient_check([0, 1, 2, 3], [1, 2, 3, 4])

np.random.seed(10)
model = RNNNumpy(vocabulary_size)
t0 = time()
model.sgd_step(X_train[10], y_train[10], 0.005)
print time() - t0

np.random.seed(10)
# Train on a small subset of the data to see what happens
model = RNNTheano(vocabulary_size)
# load_model_parameters_theano("model.data", model)
# train_with_sgd(model, X_train, y_train, nepoch=100, evaluate_loss_after=10)
# model = RNNNumpy(vocabulary_size)
train_with_sgd(model, X_train, y_train, nepoch=10, evaluate_loss_after=1)
# train_with_sgd(model, X_train[:100], y_train[:100], nepoch=10, evaluate_loss_after=1)
# with open("model.data", "w") as out_file:
#     save_model_parameters_theano(out_file, model)


def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [char_map[sentence_start]]
    # Repeat until we get an end token
    while not new_sentence[-1] == char_map[sentence_end]:
        next_word_probs = model.forward_propagation(new_sentence)
Example #34
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (
    vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        w if w in word_to_index else unknown_token for w in sent
    ]

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                      for sent in tokenized_sentences])

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
t1 = time.time()
model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE != None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model,
               X_train,
               y_train,
               nepoch=_NEPOCH,
               learning_rate=_LEARNING_RATE)
Example #35
class RNNLM:
    def __init__(self):
        self.unknown_token = "UNKNOWN_TOKEN"
        self.sentence_start_token = "SENTENCE_START"
        self.sentence_end_token = "SENTENCE_END"
        self.index_to_word = None
        self.word_to_index = None
        self.model = None

    def tokenize_data(self, n = -1):
        # download dependent nltk resources if you haven't.
        # nltk.download('punkt')

        # Read the data and append SENTENCE_START and SENTENCE_END tokens
        print "Reading sentences from gutenberg corpus ..."
        from nltk.corpus import gutenberg
        tokenized_sentences = []
        for s in gutenberg.sents('austen-emma.txt'):
            tokenized_sentences.append([self.sentence_start_token] + s[1:-1] + [self.sentence_end_token])
        print "Parsed %d sentences." % (len(tokenized_sentences))

        if n > 0:
            tokenized_sentences = tokenized_sentences[:n]

        # count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print "Found %d unique words tokens." % len(word_freq.items())

        self.vocabulary_size = int(len(word_freq.items()) * 0.95)

        # get the most common words, treat others words as unknown.
        vocab = word_freq.most_common(self.vocabulary_size - 1)
        print "Using vocabulary size %d." % self.vocabulary_size
        print "The least frequent word is '%s' and appeared %d times." % \
              (vocab[-1][0], vocab[-1][1])
        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict([(w,i) for i,w in enumerate(self.index_to_word)])

        # replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in self.word_to_index
                                      else self.unknown_token for w in sent]

        # create training data
        x_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]]
                             for sent in tokenized_sentences])
        y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]]
                             for sent in tokenized_sentences])

        print ""
        print "Example sentence: '%s'" % tokenized_sentences[0]
        print "By word indexes: '%s'" % \
              [self.word_to_index[w] for w in tokenized_sentences[0]]

        return (x_train, y_train)

    def train_numpy(self, x_train, y_train, iterations):
        self.model = RNNNumpy(word_dim = self.vocabulary_size,
                              hidden_dim = 100, bptt_truncate = 4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_theano(self, x_train, y_train, iterations):
        self.model = RNNTheano(word_dim = self.vocabulary_size,
                               hidden_dim = 100, bptt_truncate = 4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_lstm_theano(self, x_train, y_train, iterations):
        self.model = RNNTheano(word_dim = self.vocabulary_size,
                               hidden_dim = 100, bptt_truncate = 4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def generate_sentence(self):
        # repeat until we get an end token
        sentence_start_idx = self.word_to_index[self.sentence_start_token]
        sentence_end_idx = self.word_to_index[self.sentence_end_token]
        unknown_word_idx = self.word_to_index[self.unknown_token]
        # start the sentence with the start token
        new_sentence = [sentence_start_idx]
        while new_sentence[-1] != sentence_end_idx:
            next_word_probs = self.model.forward_propagation(new_sentence)
            sampled_word = unknown_word_idx
            # skip unknown words
            while sampled_word == unknown_word_idx or \
                  sampled_word == sentence_start_idx:
                samples = np.random.multinomial(1, next_word_probs[0])
                sampled_word = np.argmax(samples)
            new_sentence.append(sampled_word)
        return new_sentence

    def generate_sentences(self, num_sentences, min_length):
        for i in xrange(num_sentences):
            sent = []
            # We want long sentences, not sentences with one or two words
            while len(sent) < min_length:
                sent = self.generate_sentence()
                sent_str = [self.index_to_word[x] for x in sent[1:-1]]
            print " ".join(sent_str).encode('utf-8')
            print ""