Code example #1
File: train-theano.py Project: jimmyyentran/kaggle
def import_model(model_location, csv_file=_CSV_FILE):
    m = re.findall(r'\d+', model_location)
    vocab_size = int(m[1])
    hidden_d = int(m[0])
    print "Vocab size= %d, hidden dimensions= %d" % (vocab_size, hidden_d)
    model2 = RNNTheano(vocab_size, hidden_dim=hidden_d)
    load_model_parameters_theano(model_location, model2)
    _, idx_to_word, word_to_idx = load_data_set(csv_file, vocab_size)
    return model2, idx_to_word, word_to_idx
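For reference, a quick sketch of what the filename parsing in import_model does, assuming a checkpoint named with the hidden dimension first and the vocabulary size second (the order m[0]/m[1] expects; the path below is hypothetical):

import re

# Hypothetical checkpoint path: hidden_dim=80, vocab_size=8000, then a date.
model_location = './data/rnn-theano-80-8000-2016-01-01.npz'
m = re.findall(r'\d+', model_location)
print m[:2]  # ['80', '8000'] -> hidden_d=80, vocab_size=8000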
Code example #2
    def __init__(self, texts):

        # Create the training data
        xy = self.preprocess_text(texts)
        self.X_train = xy['x']
        self.y_train = xy['y']

        self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)

        self.train_with_sgd()
Code example #3
def train_model(x_train, y_train, training_data_name="training_data_name", load_model_file="", num_epochs=50, learning_rate=0.010, hidden_dim=100, vocab_size=8000):
    model = RNNTheano(vocab_size, hidden_dim=hidden_dim)
    t1 = time.time()
    model.sgd_step(x_train[10], y_train[10], learning_rate)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

    if load_model_file != "":
        print "loading model: %s" % load_model_file
        load_model_parameters_theano(load_model_file, model)

    train_with_sgd(model, x_train, y_train, nepoch=num_epochs, learning_rate=learning_rate, training_data_name=training_data_name)
Code example #4
def train_theano():
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], Config._LEARNING_RATE)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

    model.train_with_sgd(X_train, y_train, nepoch=Config._NEPOCH, learning_rate=Config._LEARNING_RATE)

    if Config._MODEL_FILE is not None:
        print "start saving model..."
        save_model_parameters_theano(Config._MODEL_FILE, model)
        print "model saved!"
Code example #5
def generate_examples(model_name, index_to_word, word_to_index, vocab_size=8000, hidden_dim=100, num_sentences=10, sentences_min_length=4):

    model = RNNTheano(vocab_size, hidden_dim)
    load_model_parameters_theano(model_name, model)
    sentences = []

    # Keep sampling until we have enough sentences of at least the minimum
    # length; collect them and print each one as it is produced.
    while len(sentences) < num_sentences:
        sent = generate_sentence(model, index_to_word, word_to_index)
        if len(sent) >= sentences_min_length:
            sentences.append(sent)
            print " ".join(sent)

    return sentences
Code example #6
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0])

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
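# The shift is one token: for a hypothetical sentence indexed as
# sent = [0, 12, 47, 1], x = sent[:-1] = [0, 12, 47] and y = sent[1:] = [12, 47, 1];
# at every position the model learns to predict the following word.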

# Print a training data example
x_example, y_example = X_train[17], y_train[17]
print ("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print ("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

# Training our network with Theano and the GPU

# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

np.random.seed(10)
model = RNNTheano(vocabulary_size)
model.sgd_step(X_train[10], y_train[10], 0.005)


# Run the model
model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./data/trained-model-theano.npz', model)

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
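    # (The excerpt is cut off here; a sketch of the rest, following the same
    # loop in examples #10 and #15 below plus the word-level return that the
    # callers' " ".join(sent) expects:)
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str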
Code example #7
def load_trained_model():
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)
    print 'start loading...'
    load_model_parameters_theano(Config._MODEL_FILE, model)
    print 'load over!'
    return model
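The checkpoint format itself never appears in these excerpts. In the WildML-style utils most of these projects build on, the save/load helpers simply round-trip the vanilla RNN's three weight matrices (U, V, W) through an .npz file; a minimal sketch under that assumption:

import numpy as np

def save_model_parameters_theano(outfile, model):
    # Pull the shared-variable weights out of Theano and dump them to .npz.
    U, V, W = model.U.get_value(), model.V.get_value(), model.W.get_value()
    np.savez(outfile, U=U, V=V, W=W)

def load_model_parameters_theano(path, model):
    # Inverse of the above: read the arrays back into the shared variables.
    npzfile = np.load(path)
    model.U.set_value(npzfile["U"])
    model.V.set_value(npzfile["V"])
    model.W.set_value(npzfile["W"])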
Code example #8
num_sentences = 10
senten_min_length = 7

for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print " ".join(sent)

## Evaluating Theano on the CPU

import os
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=cpu,floatX=float32"

import theano
import theano.tensor as T
from utils import *

from rnn_theano import RNNTheano, gradient_check_theano

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

## Without Theano in fast mode it took 1m 12s; Python alone took 750 ms
np.random.seed(20)
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)
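Note that %timeit on the last line is IPython magic and only works in an interactive session; in a plain script the same measurement can be taken with the time module, as the other examples here do:

import time

t1 = time.time()
model.sgd_step(X_train[10], y_train[10], 0.005)
print "One SGD step: %f milliseconds" % ((time.time() - t1) * 1000.)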
Code example #9
np.save('data/ixtoword.npy', index_to_word)

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokens])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokens])
feat_path = './data/feats.npy'
feats = np.load(feat_path)
dim_embed = 256
dim_hidden = 256
dim_image = 4096
feats = feats[:158900]

encode_img_W = np.random.uniform(-0.1, 0.1, (dim_image, dim_hidden))
encode_img_b = np.zeros((dim_hidden))

bv = np.dot(feats, encode_img_W) + encode_img_b
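# Shapes: feats (158900, 4096) dot encode_img_W (4096, 256) -> bv (158900, 256)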

model = RNNTheano(len(index_to_word), hidden_dim=dim_hidden)
t1 = time.time()
model.sgd_step(X_train[10], bv[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model, X_train, bv, y_train, nepoch=1000, learning_rate=0.01)
Code example #10
# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        w if w in word_to_index else unknown_token for w in sent
    ]

print("\nExample sentence: '%s'" % sentences[0])
print("\nExample sentence after pre-processing: '%s'" % tokenized_sentences[0])

# Create the training data
# Note that the length of each sentence is different
# X_train - every word of each sentence except the last one
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokenized_sentences])
# y_train - every word except the first one
y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                      for sent in tokenized_sentences])

model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./trained-model-theano.npz', model)


def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
            samples = np.random.multinomial(1, next_word_probs[-1])
            sampled_word = np.argmax(samples)
        new_sentence.append(sampled_word)
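    # The excerpt stops here; in the full version of this function the sampled
    # indices are mapped back to words and returned, roughly:
    sentence_str = [index_to_word[x] for x in new_sentence[1:-1]]
    return sentence_str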
Code example #11
File: run_songs.py Project: abrazinskas/Theano-RNN
ALPHA = 0.015
EPOCHS = 5
HIDDEN_LAYER_SIZE = 50
PRELOAD_WEIGHTS = False

# read the vocab
index_to_word, word_to_index = load_vocab(vocab_file)
VOCAB_SIZE = len(index_to_word)
# adding special symbols
index_to_word[VOCAB_SIZE] = sentence_start_token
index_to_word[VOCAB_SIZE+1] = sentence_end_token
word_to_index[sentence_start_token] = VOCAB_SIZE
word_to_index[sentence_end_token] = VOCAB_SIZE+1
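# Sanity check (added here for illustration): the two lookups stay inverses.
assert index_to_word[word_to_index[sentence_start_token]] == sentence_start_token
assert index_to_word[word_to_index[sentence_end_token]] == sentence_end_token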


rnn = RNNTheano(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=HIDDEN_LAYER_SIZE)
if PRELOAD_WEIGHTS:
    print "preloading weights"
    rnn.preload_weights(weights_file)
    train_loss = []
    test_loss = []
else:
    print "training the model"
    train_loss = []
    test_loss = []
    for e in range(EPOCHS):
        i = 0
        print("--- Epoch "+str(e+1)+" ---")
        train_loss.append(rnn.total_loss(
            itertools.islice(load_songs(train_file, word_to_index), MAX_L_SENTENCES)))
        test_loss.append(rnn.total_loss(
            itertools.islice(load_songs(test_file, word_to_index), MAX_L_SENTENCES)))
        sentences = load_songs(train_file,word_to_index)
Code example #12
File: rnn_lm.py Project: rosefun/blog-2
def train_lstm_theano(self, x_train, y_train, iterations):
    self.model = RNNTheano(word_dim=self.vocabulary_size,
                           hidden_dim=100,
                           bptt_truncate=4)
    self.model.sgd(x_train, y_train, 0.01, iterations)
Code example #13
# creating vocabulary
if not os.path.isfile(vocab_file):
    vocab = construct_vocabulary(train_file)
    write_vocabulary(vocab, vocab_file)

# read the vocab
index_to_word, word_to_index = read_vocabulary(vocab_file, 8000)
# adding special symbols
index_to_word.append(sentence_end_token)
index_to_word.append(sentence_start_token)
word_to_index[sentence_end_token] = VOCAB_SIZE
word_to_index[sentence_start_token] = VOCAB_SIZE + 1

if THEANO:
    rnn = RNNTheano(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=50)
else:
    rnn = RNN(VOCAB_SIZE + SPEC_SYMBOLS_COUNT,
              VOCAB_SIZE + SPEC_SYMBOLS_COUNT,
              hidden_dim=100)
# generate sentences
print("training the model")
loss = [
    rnn.total_loss(
        itertools.islice(tokenize_file(word_to_index, train_file),
                         MAX_L_SENTENCES))
]
for e in range(EPOCHS):
    i = 0
    print("--- Epoch " + str(e + 1) + " ---")
    loss.append(
Code example #14
import preprocess
from rnn_numpy import RNNNumpy
from rnn_theano import RNNTheano
import numpy as np
import cProfile

X_train, y_train, vocabulary_size = preprocess.create_train_data()
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
#cProfile.run("model.numpy_sdg_step(X_train[10], y_train[10], 0.005)")
#print("----------------------------------------------------------------")
np.random.seed(10)
model_theano = RNNTheano(vocabulary_size)
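# Seeding identically before each constructor gives both implementations the
# same random initial weights, so their loss curves are directly comparable.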
#cProfile.run("model_theano.train_with_sgd(X_train[10], y_train[10], 0.005)")
print("----------------------------------------------------------------")
losses_numpy = model.train_with_sgd(X_train[:100],
                                    y_train[:100],
                                    nepoch=5,
                                    evaluate_loss_after=1)
losses_theano = model_theano.train_with_sgd(X_train[:100],
                                            y_train[:100],
                                            nepoch=5,
                                            evaluate_loss_after=1)
Code example #15
File: vanillarnn2.py Project: ihasdapie/text-rnn
'''
np.random.seed(10) #FLAG
# Train on a small subset of the data to see what happens
model = RNNumpy(vocabsize)
losses = trainwithsgd(model, Xtrain[:100], ytrain[:100], nepoch=10, evaluate_loss_after=1)



np.random.seed(10) #FLAG
model = RNNTheano(vocabsize)
model.sgd_step(Xtrain[10], ytrain[10], 0.005)
'''

from utils import load_model_parameters_theano, save_model_parameters_theano

model = RNNTheano(vocabsize, hiddendim=50)
# losses = train_with_sgd(model, X_train, y_train, nepoch=50)
# save_model_parameters_theano('./data/trained-model-theano.npz', model)
load_model_parameters_theano('/home/ihasdapie/Documents/AI/Data/trained-model-theano.npz', model)
def generate_sentence(model):
	# We start the sentence with the start token
	new_sentence = [wordtoindex[starttoken]]
	# Repeat until we get an end token
	while not new_sentence[-1] == wordtoindex[endtoken]:
		next_word_probs = model.forward_propagation(new_sentence)
		sampled_word = wordtoindex[unknowntoken]
		# We don't want to sample unknown words
		while sampled_word == wordtoindex[unknowntoken]:
			samples = np.random.multinomial(1, next_word_probs[-1])
			sampled_word = np.argmax(samples)
		new_sentence.append(sampled_word)
Code example #16
    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    #print 'Found %d unique word tokens.' % len(word_freq.items())

    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(_VOCABULARY_SIZE-1)
    index_to_word = [x[0] for x in vocab]
    index_to_word.append(unknown_token)
    word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)])

    with open(_PICKLE_IDX_WRD_FILE, 'wb') as pkl_file:
        pickle.dump(index_to_word, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)
    with open(_PICKLE_WRD_IDX_FILE, 'wb') as pkl_file:
        pickle.dump(word_to_index, pkl_file, protocol=pickle.HIGHEST_PROTOCOL)
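    # The lookups can later be restored with the inverse calls, e.g.:
    #   with open(_PICKLE_IDX_WRD_FILE, 'rb') as pkl_file:
    #       index_to_word = pickle.load(pkl_file)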

model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)
load_model_parameters_theano(_MODEL_FILE, model)

def clean_up_sentence(sent):
    clean_dict = {' ,': ',',
                  ' !': '!',
                  ' :': ':',
                  ' ?': '?',
                  ' .': '.',
                  ' \'': '\'',
                  ' --':'--'}

    for bad, good in clean_dict.iteritems():
        sent = sent.replace(bad, good)

    pms = clean_dict.values()
Code example #17
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]


# Create the training data
X_train = np.asarray([[np.int32(word_to_index[w]) for w in sent[:-1]] for sent in tokenized_sentences])
Y_train = np.asarray([[np.int32(word_to_index[w]) for w in sent[1:]] for sent in tokenized_sentences])

print X_train, type(X_train)
print Y_train, type(Y_train)

np.random.seed(10)
model = RNNTheano(vocabulary_size)
# model = RNNNumpy(vocabulary_size)
# o, s = model.forward_propagation(X_train[1])
# print o.shape
# print o
l = [8,9,0,1,2,3,4,5,6,7]
x = np.asarray([np.int32(a) for a in l])
l2 = [3,4,5,9,0,1]
x2 = np.asarray([np.int32(a) for a in l2])
# x = np.asarray([np.int32(a) for a in range(0,10)])
# print x, type(x)
print "input", x, x2
# x[0] = 10
# print x, type(x)
# o = model.forward_propagation(x)
# print "o.shape",(o).shape, o 
Code example #18
# Replace all words not in our vocabulary with the unknown token
# todo needs cleaner text preprocessing
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        w if w in word_to_index else unknown_token for w in sent
    ]

# Create the training data
X_train = numpy.asarray([[word_to_index[w] for w in sent[:-1]]
                         for sent in tokenized_sentences])
y_train = numpy.asarray([[word_to_index[w] for w in sent[1:]]
                         for sent in tokenized_sentences])

######################################################################## construct RNN
print "constructing model..."  # todo try a smarter initialization - wrt vanishing gradients
model = RNNTheano(vocabulary_size, hidden_dim=HIDDEN_DIM)

gradient_check_theano(model,
                      X_train[10],
                      y_train[10],
                      h=0.0000001,
                      error_threshold=0.01)
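# (Gradient checking compares each backpropagated gradient against a centered
# finite difference, (J(theta + h) - J(theta - h)) / (2 * h), and flags any
# parameter whose relative error exceeds error_threshold.)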

######################################################################## train

if RETRAIN:
    # run a single step to get a feel for training time
    print "run a single step..."
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], LEARNING_RATE)
    t2 = time.time()
Code example #19
                time, num_examples_seen, epoch, accuracy)

        # For each training example (SGD step)...
        for i in range(len(y_train)):
            # One SGD step
            model.sgd_step(X_train[i], y_train[i], learning_rate)
            num_examples_seen += 1


# Create the training data
x, y = cPickle.load(open('OnlyNPs_codeonly.cPickle', 'rb'))
X_train = np.array(x, dtype='float32')
y_train = np.array(y, dtype='float32')

# Specify model and timing one SGD step
model = RNNTheano(10, 333, hidden_dim=30)
#t1 = time.time()
#model.sgd_step(X_train[10], y_train[10], 0.005)
#t2 = time.time()
#print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

# Train model for n epochs
train_with_sgd(model,
               X_train,
               y_train,
               nepoch=150,
               learning_rate=0.01,
               evaluate_loss_after=1)

# Save model parameters
save_model_parameters_theano('model_parameters_OnlyNPs', model)
Code example #20
File: train-theano.py Project: altair15/LSTM
print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (
    vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        w if w in word_to_index else unknown_token for w in sent
    ]

# Create the training data
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                      for sent in tokenized_sentences])

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
t1 = time.time()
model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model,
               X_train,
               y_train,
               nepoch=_NEPOCH,
               learning_rate=_LEARNING_RATE)
Code example #21
    print "Expected Loss for random predictions: %f" % np.log(model.word_dim)
    print "Actual loss: %f" % model.calculate_loss(X_train[:100],
                                                   y_train[:100])


def test_performance(model, learning_rate):
    print "\ntest performance: " + str(type(model))
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], learning_rate)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)


model_gru = GRUTheano(word_dim=_VOCABULARY_SIZE,
                      hidden_dim=_HIDDEN_DIM,
                      bptt_truncate=-1)
model_theano = RNNTheano(word_dim=_VOCABULARY_SIZE,
                         hidden_dim=_HIDDEN_DIM,
                         bptt_truncate=-1)
model_rnn = RNNNumpy(word_dim=_VOCABULARY_SIZE,
                     hidden_dim=_HIDDEN_DIM,
                     bptt_truncate=-1)

test_performance(model_gru, _LEARNING_RATE)
test_performance(model_theano, _LEARNING_RATE)
test_performance(model_rnn, _LEARNING_RATE)

test_loss(model_gru)
test_loss(model_theano)
test_loss(model_rnn)
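The "Expected Loss for random predictions" baseline printed by test_loss follows from uniform guessing: each of C vocabulary words gets probability 1/C, so the per-word cross-entropy is -log(1/C) = log(C). A quick sanity check for the 8000-word vocabulary these examples typically use:

import numpy as np

# Uniform guessing over C = 8000 words costs log(C) nats per word.
print(np.log(8000))  # ~8.987; a trained model should sit well below this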