def import_model(model_location, csv_file=_CSV_FILE):
    """Rebuild a saved RNNTheano model together with its vocabulary maps.

    The network sizes are not passed in explicitly; they are read from the
    runs of digits found in ``model_location``: the first run is used as the
    hidden dimension and the second as the vocabulary size.

    Args:
        model_location: Path of the saved parameter file. It must contain at
            least two runs of digits. Digits anywhere in the string count,
            including any that appear in directory names.
        csv_file: Data set passed to ``load_data_set`` (together with the
            vocabulary size) to obtain the index/word mappings.

    Returns:
        A tuple ``(model, idx_to_word, word_to_idx)``.

    Raises:
        IndexError: If ``model_location`` contains fewer than two numbers.
    """
    # Raw string: in a plain literal '\d' is an invalid escape sequence,
    # which newer interpreters warn about.
    numbers = re.findall(r'\d+', model_location)
    vocab_size = int(numbers[1])
    hidden_d = int(numbers[0])
    print("Vocab size= %d, hidden dimensions= %d" % (vocab_size, hidden_d))

    model2 = RNNTheano(vocab_size, hidden_dim=hidden_d)
    load_model_parameters_theano(model_location, model2)
    # Only the two mappings are needed; the first returned item is discarded.
    _, idx_to_word, word_to_idx = load_data_set(csv_file, vocab_size)
    return model2, idx_to_word, word_to_idx
def __init__(self, texts):
    """Prepare training data from ``texts``, build the model and train it.

    ``self.preprocess_text`` is expected to return a mapping with the keys
    ``'x'`` and ``'y'``; these become ``self.X_train`` and ``self.y_train``.
    Training is started from the constructor via ``self.train_with_sgd()``.

    Args:
        texts: Raw input handed unchanged to ``self.preprocess_text``.
    """
    # Create the training data
    prepared = self.preprocess_text(texts)
    self.X_train = prepared['x']
    self.y_train = prepared['y']

    # Network size comes from the module-level configuration constants.
    self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)
    self.train_with_sgd()
def train_model(x_train, y_train, training_data_name="training_data_name",
                load_model_file="", num_epochs=50, learning_rate=0.010,
                hidden_dim=100, vocab_size=8000):
    """Create an RNNTheano model, time one SGD step, then train it.

    Args:
        x_train: Training inputs; element 10 is used for the timing step, so
            at least 11 samples are required.
        y_train: Training targets, indexed in parallel with ``x_train``.
        training_data_name: Label forwarded to ``train_with_sgd``.
        load_model_file: Parameter file to load before training. The empty
            string (the default) means no parameters are loaded.
        num_epochs: Passed to ``train_with_sgd`` as ``nepoch``.
        learning_rate: Learning rate for the timing step and for training.
        hidden_dim: Hidden-layer size of the model.
        vocab_size: Vocabulary size of the model.
    """
    model = RNNTheano(vocab_size, hidden_dim=hidden_dim)

    # Run a single SGD step and report how long it took.
    started = time.time()
    model.sgd_step(x_train[10], y_train[10], learning_rate)
    finished = time.time()
    print("SGD Step time: %f milliseconds" % ((finished - started) * 1000.))

    # The parameter file is loaded after the timing step above has already
    # updated the freshly initialised model.
    if load_model_file != "":
        print("loading model: %s" % load_model_file)
        load_model_parameters_theano(load_model_file, model)

    train_with_sgd(
        model,
        x_train,
        y_train,
        nepoch=num_epochs,
        learning_rate=learning_rate,
        training_data_name=training_data_name,
    )
def train_theano():
    """Train an RNNTheano model on the module-level training data.

    The model size, learning rate, epoch count and output file all come from
    ``Config``. One SGD step on sample 10 is timed and reported before the
    full training run. The trained parameters are saved when
    ``Config._MODEL_FILE`` is set.

    Reads the globals ``X_train`` and ``y_train``; returns nothing.
    """
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)

    # Time one SGD step to get a feel for the cost of a full epoch.
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], Config._LEARNING_RATE)
    t2 = time.time()
    print("SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.))

    model.train_with_sgd(X_train, y_train, nepoch=Config._NEPOCH,
                         learning_rate=Config._LEARNING_RATE)

    # Identity comparison is the correct way to test against None.
    if Config._MODEL_FILE is not None:
        print("start saving model...")
        save_model_parameters_theano(Config._MODEL_FILE, model)
        print("model saved!")
def generate_examples(model_name, index_to_word, word_to_index, vocab_size=8000,
                      hidden_dim=100, num_sentences=10, sentences_min_length=4):
    """Load a trained model and sample sentences from it.

    Two passes are made: the first prints ``num_sentences`` sentences, the
    second collects another ``num_sentences`` sentences in a local list.
    Sentences shorter than ``sentences_min_length`` are discarded and
    re-sampled in both passes.

    Args:
        model_name: Parameter file given to ``load_model_parameters_theano``.
        index_to_word: Index-to-word mapping forwarded to ``generate_sentence``.
        word_to_index: Word-to-index mapping forwarded to ``generate_sentence``.
        vocab_size: Vocabulary size used to construct the model.
        hidden_dim: Hidden-layer size used to construct the model.
        num_sentences: How many sentences each pass produces.
        sentences_min_length: Minimum accepted sentence length.
    """
    model = RNNTheano(vocab_size, hidden_dim)
    load_model_parameters_theano(model_name, model)
    sentences = []

    # Pass 1: keep sampling until the sentence is long enough, then print it.
    for _ in range(num_sentences):
        candidate = []
        while len(candidate) < sentences_min_length:
            candidate = generate_sentence(model, index_to_word, word_to_index)
        print(" ".join(candidate))

    # Pass 2: collect freshly sampled sentences that are long enough.
    # NOTE(review): the collected list is not returned by the code visible
    # here -- confirm whether a return statement is intended.
    while len(sentences) < num_sentences:
        candidate = generate_sentence(model, index_to_word, word_to_index)
        if len(candidate) < sentences_min_length:
            continue
        sentences.append(candidate)
print("\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]) # Create the training data X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences]) y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences]) # Print an training data example x_example, y_example = X_train[17], y_train[17] print ("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example)) print ("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example)) #Training our Network with Theano and the GPU # To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking. grad_check_vocab_size = 5 model = RNNTheano(grad_check_vocab_size, 10) gradient_check_theano(model, [0,1,2,3], [1,2,3,4]) np.random.seed(10) model = RNNTheano(vocabulary_size) model.sgd_step(X_train[10], y_train[10], 0.005) # Run the model model = RNNTheano(vocabulary_size, hidden_dim=50) load_model_parameters_theano('./data/trained-model-theano.npz', model) def generate_sentence(model): # We start the sentence with the start token new_sentence = [word_to_index[sentence_start_token]] # Repeat until we get an end token
def load_trained_model():
    """Return an RNNTheano model with parameters loaded from disk.

    The model is sized from ``Config._VOCABULARY_SIZE`` and
    ``Config._HIDDEN_DIM``; its parameters are read from
    ``Config._MODEL_FILE``. Progress messages are printed before and after
    the load.
    """
    restored = RNNTheano(Config._VOCABULARY_SIZE,
                         hidden_dim=Config._HIDDEN_DIM)

    print('start loading...')
    load_model_parameters_theano(Config._MODEL_FILE, restored)
    print('load over!')

    return restored
# Print `num_sentences` sampled sentences, each at least
# `senten_min_length` tokens long.
num_sentences = 10
senten_min_length = 7
for i in range(num_sentences):
    sent = []
    # We want long sentences, not sentences with one or two words
    while len(sent) < senten_min_length:
        sent = generate_sentence(model)
    print " ".join(sent)

## Evaluating on Theano with CPU
import os
# THEANO_FLAGS is set before `import theano` below; keep this order.
os.environ["THEANO_FLAGS"] = "mode=FAST_RUN,device=cpu,floatX=float32"
import theano
import theano.tensor as T
from utils import *
from rnn_theano import RNNTheano, gradient_check_theano

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0,1,2,3], [1,2,3,4])

## Without Theano in fast mode it took 1m 12s. Python alone took 750 ms
np.random.seed(20)
model = RNNTheano(vocabulary_size)
# IPython line magic: times a single SGD step on training sample 10.
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)
# Persist the index-to-word table.
np.save('data/ixtoword.npy', index_to_word)

# Create the training data: inputs are every token but the last, targets are
# every token but the first (next-word prediction).
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokens])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokens])

feat_path = './data/feats.npy'
dim_embed = 256
dim_hidden = 256
dim_image = 4096

# Load the image features once and keep the first 158900 rows.
feats = np.load(feat_path)
feats = feats[:158900]

# Random linear projection of the image features into the hidden space.
encode_img_W = np.random.uniform(-0.1, 0.1, (dim_image, dim_hidden))
encode_img_b = np.zeros((dim_hidden))
# Matrix product, not elementwise '*': the weight has shape
# (dim_image, dim_hidden). Assumes feats is (num_samples, dim_image) --
# TODO confirm against the feature file.
bv = np.dot(feats, encode_img_W) + encode_img_b

model = RNNTheano(len(index_to_word), hidden_dim=dim_hidden)

# Time a single SGD step on sample 10.
t1 = time.time()
model.sgd_step(X_train[10], bv[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print("SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.))

# Load previously saved parameters, if a model file is configured.
if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model, X_train, bv, y_train, nepoch=1000, learning_rate=0.01)
w if w in word_to_index else unknown_token for w in sent ] print("\nExample sentence: '%s'" % sentences[0]) print("\nExample sentence after pre-processing: '%s'" % tokenized_sentences[0]) # Create the training data # Note that the length of each sentence is different # X_train - every words of each sentence except for the last one X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences]) # y_train - every words except for the first one y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences]) model = RNNTheano(vocabulary_size, hidden_dim=50) load_model_parameters_theano('./trained-model-theano.npz', model) def generate_sentence(model): # We start the sentence with the start token new_sentence = [word_to_index[sentence_start_token]] # Repeat until we get an end token while not new_sentence[-1] == word_to_index[sentence_end_token]: next_word_probs = model.forward_propagation(new_sentence) sampled_word = word_to_index[unknown_token] # We don't want to sample unknown words while sampled_word == word_to_index[unknown_token]: samples = np.random.multinomial(1, next_word_probs[-1]) sampled_word = np.argmax(samples) new_sentence.append(sampled_word)
ALPHA = 0.015 EPOCHS = 5 HIDDEN_LAYER_SIZE = 50 PRELOAD_WEIGHTS = False # read the vocab index_to_word, word_to_index = load_vocab(vocab_file) VOCAB_SIZE = len(index_to_word) # adding special symbols index_to_word[VOCAB_SIZE] = sentence_start_token index_to_word[VOCAB_SIZE+1] = sentence_end_token word_to_index[sentence_start_token] = VOCAB_SIZE word_to_index[sentence_end_token] = VOCAB_SIZE+1 rnn = RNNTheano(VOCAB_SIZE+SPEC_SYMBOLS_COUNT, hidden_dim = HIDDEN_LAYER_SIZE) if PRELOAD_WEIGHTS: print "preloading weights" rnn.preload_weights(weights_file) train_loss = [] test_loss = [] else: print "training the model" train_loss = [] test_loss = [] for e in range(EPOCHS): i = 0 print("--- Epoch "+str(e+1)+" ---") train_loss.append(rnn.total_loss(itertools.islice(load_songs(train_file,word_to_index),MAX_L_SENTENCES))) test_loss.append(rnn.total_loss(itertools.islice(load_songs(test_file,word_to_index),MAX_L_SENTENCES))) sentences = load_songs(train_file,word_to_index)
def train_lstm_theano(self, x_train, y_train, iterations):
    """Create a fresh RNNTheano model on ``self.model`` and train it.

    The model is built with ``self.vocabulary_size`` as its word dimension,
    100 hidden units and a BPTT truncation of 4, then trained through its
    ``sgd`` method with a learning rate of 0.01. Note that, despite the
    method name, the class instantiated here is ``RNNTheano``.

    Args:
        x_train: Training inputs forwarded to ``sgd``.
        y_train: Training targets forwarded to ``sgd``.
        iterations: Forwarded to ``sgd`` as its last positional argument.
    """
    hidden_units = 100
    truncate_steps = 4
    step_size = 0.01

    self.model = RNNTheano(
        word_dim=self.vocabulary_size,
        hidden_dim=hidden_units,
        bptt_truncate=truncate_steps,
    )
    self.model.sgd(x_train, y_train, step_size, iterations)
# creating vocabulary if not os.path.isfile(vocab_file): vocab = construct_vocabulary(train_file) write_vocabulary(vocab, vocab_file) # read the vocab index_to_word, word_to_index = read_vocabulary(vocab_file, 8000) # adding special symbols index_to_word.append(sentence_end_token) index_to_word.append(sentence_start_token) word_to_index[sentence_start_token] = VOCAB_SIZE + 1 word_to_index[sentence_end_token] = VOCAB_SIZE + 2 if THEANO: rnn = RNNTheano(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=50) else: rnn = RNN(VOCAB_SIZE + SPEC_SYMBOLS_COUNT, VOCAB_SIZE + SPEC_SYMBOLS_COUNT, hidden_dim=100) # generate sentences print("training the model") loss = [ rnn.total_loss( itertools.islice(tokenize_file(word_to_index, train_file), MAX_L_SENTENCES)) ] for e in range(EPOCHS): i = 0 print("--- Epoch " + str(e + 1) + " ---") loss.append(
import cProfile

import numpy as np

import preprocess
from rnn_numpy import RNNNumpy
from rnn_theano import RNNTheano

# Compare the NumPy and Theano RNN implementations on a small slice of data.
X_train, y_train, vocabulary_size = preprocess.create_train_data()

# Both models are built right after seeding with the same value. (The seed
# and NumPy model construction used to be repeated twice back to back; one
# copy is enough.)
np.random.seed(10)
model = RNNNumpy(vocabulary_size)
#cProfile.run("model.numpy_sdg_step(X_train[10], y_train[10], 0.005)")

#print("----------------------------------------------------------------")

np.random.seed(10)
model_theano = RNNTheano(vocabulary_size)
#cProfile.run("model_theano.train_with_sgd(X_train[10], y_train[10], 0.005)")

print("----------------------------------------------------------------")

# Train each model for 5 epochs on the first 100 samples, evaluating the
# loss after every epoch.
losses_numpy = model.train_with_sgd(X_train[:100], y_train[:100], nepoch=5, evaluate_loss_after=1)
losses_theano = model_theano.train_with_sgd(X_train[:100], y_train[:100], nepoch=5, evaluate_loss_after=1)
''' np.random.seed(10) #FLAG # Train on a small subset of the data to see what happens model = RNNumpy(vocabsize) losses = trainwithsgd(model, Xtrain[:100], ytrain[:100], nepoch=10, evaluate_loss_after=1) np.random.seed(10) #FLAG model = RNNTheano(vocabsize) model.sgd_step(Xtrain[10], ytrain[10], 0.005) ''' from utils import load_model_parameters_theano, save_model_parameters_theano model = RNNTheano(vocabsize, hiddendim=50) # losses = train_with_sgd(model, X_train, y_train, nepoch=50) # save_model_parameters_theano('./data/trained-model-theano.npz', model) load_model_parameters_theano('/home/ihasdapie/Documents/AI/Data/trained-model-theano.npz', model) def generate_sentence(model): # We start the sentence with the start token new_sentence = [wordtoindex[starttoken]] # Repeat until we get an end token while not new_sentence[-1] == wordtoindex[endtoken]: next_word_probs = model.forward_propagation(new_sentence) sampled_word = wordtoindex[unknowntoken] # We don't want to sample unknown words while sampled_word == wordtoindex[unknowntoken]: samples = np.random.multinomial(1, next_word_probs[-1]) sampled_word = np.argmax(samples) new_sentence.append(sampled_word)
# Count the word frequencies word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences)) #print 'Found %d unique words tokens.' % len(word_freq.items()) # Get the most common words and build index_to_word and word_to_index vectors vocab = word_freq.most_common(_VOCABULARY_SIZE-1) index_to_word = [x[0] for x in vocab] index_to_word.append(unknown_token) word_to_index = dict([(w,i) for i,w in enumerate(index_to_word)]) with open(_PICKLE_IDX_WRD_FILE, 'wb') as pkl_file: pickle.dump(index_to_word, pkl_file, protocol=pickle.HIGHEST_PROTOCOL) with open(_PICKLE_WRD_IDX_FILE, 'wb') as pkl_file: pickle.dump(word_to_index, pkl_file, protocol=pickle.HIGHEST_PROTOCOL) model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM) load_model_parameters_theano(_MODEL_FILE, model) def clean_up_sentence(sent): clean_dict = {' ,': ',', ' !': '!', ' :': ':', ' ?': '?', ' .': '.', ' \'': '\'', ' --':'--'} for bad, good in clean_dict.iteritems(): sent = sent.replace(bad, good) pms = clean_dict.values()
# Replace every word that is not in the vocabulary with the unknown token.
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

print "\nExample sentence: '%s'" % sentences[0]
print "\nExample sentence after Pre-processing: '%s'" % tokenized_sentences[0]

# Create the training data
# NOTE(review): the slicing here is over the list of sentences, so X_train[k]
# is the whole of sentence k and Y_train[k] the whole of sentence k+1. This
# is not a one-token shift within a sentence -- confirm this is intended.
X_train = np.asarray([[np.int32(word_to_index[w]) for w in sent] for sent in tokenized_sentences[:-1]])
Y_train = np.asarray([[np.int32(word_to_index[w]) for w in sent] for sent in tokenized_sentences[1:]])

print X_train, type(X_train)
print Y_train, type(Y_train)

np.random.seed(10)
model = RNNTheano(vocabulary_size)
# model = RNNNumpy(vocabulary_size)
# o, s = model.forward_propagation(X_train[1])
# print o.shape
# print o

# Two small hand-written index sequences, converted to int32 arrays.
l = [8,9,0,1,2,3,4,5,6,7]
x = np.asarray([np.int32(a) for a in l])
l2 = [3,4,5,9,0,1]
x2 = np.asarray([np.int32(a) for a in l2])
# x = np.asarray([np.int32(a) for a in range(0,10)])
# print x, type(x)
print "input", x, x2
# x[0] = 10
# print x, type(x)
# o = model.forward_propagation(x)
# print "o.shape",(o).shape, o
# Replace all words not in our vocabulary with the unknown token # todo needs cleaner text preprocessing for i, sent in enumerate(tokenized_sentences): tokenized_sentences[i] = [ w if w in word_to_index else unknown_token for w in sent ] # Create the training data X_train = numpy.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences]) y_train = numpy.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences]) ######################################################################## construct RNN print "constructing model..." # todo try a smarter initialization - wrt vanishing gradients model = RNNTheano(vocabulary_size, hidden_dim=HIDDEN_DIM) gradient_check_theano(model, X_train[10], y_train[10], h=0.0000001, error_threshold=0.01) ######################################################################## train if RETRAIN: # run a single step to get a feel for training time print "run a single step..." t1 = time.time() model.sgd_step(X_train[10], y_train[10], LEARNING_RATE) t2 = time.time()
time, num_examples_seen, epoch, accuracy) # For each training example (SGD step)... for i in range(len(y_train)): # One SGD step model.sgd_step(X_train[i], y_train[i], learning_rate) num_examples_seen += 1 # Create the training data x, y = cPickle.load(open('OnlyNPs_codeonly.cPickle', 'rb')) X_train = np.array(x, dtype='float32') y_train = np.array(y, dtype='float32') # Specify model and timing one SGD step model = RNNTheano(10, 333, hidden_dim=30) #t1 = time.time() #model.sgd_step(X_train[10], y_train[10], 0.005) #t2 = time.time() #print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.) # Train model for n epoches train_with_sgd(model, X_train, y_train, nepoch=150, learning_rate=0.01, evaluate_loss_after=1) # Save model parameters save_model_parameters_theano('model_parameters_OnlyNPs', model)
print("Using vocabulary size %d." % vocabulary_size)
print("The least frequent word in our vocabulary is '%s' and appeared %d times." % (
    vocab[-1][0], vocab[-1][1]))

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [
        w if w in word_to_index else unknown_token for w in sent
    ]

# Create the training data: inputs are every token but the last, targets are
# every token but the first (next-word prediction).
X_train = np.asarray([[word_to_index[w] for w in sent[:-1]]
                      for sent in tokenized_sentences])
y_train = np.asarray([[word_to_index[w] for w in sent[1:]]
                      for sent in tokenized_sentences])

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)

# Time a single SGD step on sample 10.
t1 = time.time()
model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print("SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.))

# Load previously saved parameters, if a model file is configured.
# Identity comparison is the correct way to test against None.
if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH,
               learning_rate=_LEARNING_RATE)
print "Expected Loss for random predictions: %f" % np.log(model.word_dim) print "Actual loss: %f" % model.calculate_loss(X_train[:100], y_train[:100]) def test_performance(model, learning_rate): print "\ntest performance: " + str(type(model)) t1 = time.time() model.sgd_step(X_train[10], y_train[10], learning_rate) t2 = time.time() print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.) model_gru = GRUTheano(word_dim=_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM, bptt_truncate=-1) model_theano = RNNTheano(word_dim=_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM, bptt_truncate=-1) model_rnn = RNNNumpy(word_dim=_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM, bptt_truncate=-1) test_performance(model_gru, _LEARNING_RATE) test_performance(model_theano, _LEARNING_RATE) test_performance(model_rnn, _LEARNING_RATE) test_loss(model_gru) test_loss(model_theano) test_loss(model_rnn)