def train(X_train, y_train, vocabulary_size, hiddenDim, modelFiles):
    model = RNNTheano(vocabulary_size, hidden_dim=hiddenDim)
    # Time a single SGD step to estimate how long training will take
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
    # Optionally resume from previously saved model parameters
    if modelFiles is not None:
        load_model_parameters_theano(modelFiles, model)
    train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)
def train_model(x_train, y_train, training_data_name="training_data_name",
                load_model_file="", num_epochs=50, learning_rate=0.010,
                hidden_dim=100, vocab_size=8000):
    model = RNNTheano(vocab_size, hidden_dim=hidden_dim)
    # Time a single SGD step as a rough estimate of per-example training cost
    t1 = time.time()
    model.sgd_step(x_train[10], y_train[10], learning_rate)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
    # Optionally warm-start from a previously saved model
    if load_model_file != "":
        print "loading model: %s" % load_model_file
        load_model_parameters_theano(load_model_file, model)
    train_with_sgd(model, x_train, y_train, nepoch=num_epochs,
                   learning_rate=learning_rate, training_data_name=training_data_name)
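# A minimal usage sketch for train_model above, assuming RNNTheano,
# train_with_sgd and load_model_parameters_theano are importable as in the
# other snippets here. The toy index sequences are illustrative only; at
# least 11 examples are needed because sgd_step is timed on index 10.
import numpy as np

x_toy = np.asarray([[0, 1, 2, 3, 4] for _ in range(12)])
y_toy = np.asarray([[1, 2, 3, 4, 5] for _ in range(12)])
train_model(x_toy, y_toy,
            training_data_name="toy_data",
            num_epochs=5,
            learning_rate=0.01,
            hidden_dim=10,
            vocab_size=10)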
def train_theano():
    model = RNNTheano(Config._VOCABULARY_SIZE, hidden_dim=Config._HIDDEN_DIM)
    # Time a single SGD step before starting the full run
    t1 = time.time()
    model.sgd_step(X_train[10], y_train[10], Config._LEARNING_RATE)
    t2 = time.time()
    print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)
    model.train_with_sgd(X_train, y_train, nepoch=Config._NEPOCH,
                         learning_rate=Config._LEARNING_RATE)
    # Save the trained parameters so they can be reloaded later
    if Config._MODEL_FILE is not None:
        print "start saving model..."
        save_model_parameters_theano(Config._MODEL_FILE, model)
        print "model saved!"
print "Using vocabulary size %d." % vocabulary_size print "The least frequent word in our vocabulary is '%s' and appeared %d times." % ( vocab[-1][0], vocab[-1][1]) # Replace all words not in our vocabulary with the unknown token for i, sent in enumerate(tokenized_sentences): tokenized_sentences[i] = [ w if w in word_to_index else unknown_token for w in sent ] # Create the training data X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences]) y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences]) model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM) t1 = time.time() model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE) t2 = time.time() print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.) if _MODEL_FILE != None: load_model_parameters_theano(_MODEL_FILE, model) train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)
# Get the most common words and build index_to_word and word_to_index vectors
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [x[0] for x in vocab]
index_to_word.append(unknown_token)
word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

print "Using vocabulary size %d." % vocabulary_size
print "The least frequent word in our vocabulary is '%s' and appeared %d times." % (vocab[-1][0], vocab[-1][1])

# Replace all words not in our vocabulary with the unknown token
for i, sent in enumerate(tokenized_sentences):
    tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

# Create the training data from the pre-built token sequences
# (earlier alternatives kept for reference):
# X_train = np.asarray([[ord(char)] for char in chars])
# y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
X_train = np.asarray(sentences_tokens_x)
y_train = np.asarray(sentences_tokens_y)

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
t1 = time.time()
model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
t2 = time.time()
print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)

train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)
# Print a training data example
x_example, y_example = X_train[17], y_train[17]
print ("x:\n%s\n%s" % (" ".join([index_to_word[x] for x in x_example]), x_example))
print ("\ny:\n%s\n%s" % (" ".join([index_to_word[x] for x in y_example]), y_example))

# Training our Network with Theano and the GPU
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0, 1, 2, 3], [1, 2, 3, 4])

np.random.seed(10)
model = RNNTheano(vocabulary_size)
model.sgd_step(X_train[10], y_train[10], 0.005)

# Run the model
model = RNNTheano(vocabulary_size, hidden_dim=50)
load_model_parameters_theano('./data/trained-model-theano.npz', model)

def generate_sentence(model):
    # We start the sentence with the start token
    new_sentence = [word_to_index[sentence_start_token]]
    # Repeat until we get an end token
    while not new_sentence[-1] == word_to_index[sentence_end_token]:
        next_word_probs = model.forward_propagation(new_sentence)
        sampled_word = word_to_index[unknown_token]
        # We don't want to sample unknown words
        while sampled_word == word_to_index[unknown_token]:
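# A minimal, self-contained sketch of the sampling step used inside
# generate_sentence's inner loop, assuming next_word_probs[-1] is the softmax
# distribution over the next word (the convention of the WildML-style
# RNNTheano this code builds on). The helper name sample_next_word is
# illustrative, not part of the original code.
import numpy as np

def sample_next_word(next_word_probs, unknown_index):
    # Draw one word index from the predicted distribution,
    # rejecting UNKNOWN_TOKEN and redrawing until a real word is sampled
    sampled_word = unknown_index
    while sampled_word == unknown_index:
        samples = np.random.multinomial(1, next_word_probs[-1])
        sampled_word = np.argmax(samples)
    return sampled_word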
# ==========================================================================
# ================= USE THE GUY'S RNN CLASS FOR THEANO: ===================
# ==========================================================================
from rnn_theano import RNNTheano, gradient_check_theano

np.random.seed(10)
# To avoid performing millions of expensive calculations we use a smaller vocabulary size for checking.
grad_check_vocab_size = 5
model = RNNTheano(grad_check_vocab_size, 10)
gradient_check_theano(model, [0, 1, 2, 3], [1, 2, 3, 4])

np.random.seed(10)
model = RNNTheano(vocabulary_size)
%timeit model.sgd_step(X_train[10], y_train[10], 0.005)

# LOAD the model parameters that he trained:
model = RNNTheano(vocabulary_size, hidden_dim=50)
utils.load_model_parameters_theano('./data/trained-model-theano.npz', model)

# TRAIN the model if wanted, but he said he trained his for 20hrs:
# losses = train_with_sgd(model, X_train, y_train, nepoch=50)
# save_model_parameters_theano('./data/trained-model-theano.npz', model)

# ======================================================
# ================= BUILD OWN RNN: =====================
# ======================================================
'''
The input x = X_train[i] will be a sequence of words
import itertools
import nltk
import numpy as np

from rnn_theano import RNNTheano


class LM_With_RNN:

    def __init__(self, texts):
        # Create the training data
        xy = self.preprocess_text(texts)
        self.X_train = xy['x']
        self.y_train = xy['y']
        self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)
        self.train_with_sgd()

    def train_with_sgd(self, nepoch=_NEPOCH, evaluate_loss_after=5, learning_rate=_LEARNING_RATE):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if epoch % evaluate_loss_after == 0:
                loss = self.model.calculate_loss(self.X_train, self.y_train)
                losses.append((num_examples_seen, loss))
                # Adjust the learning rate if loss increases
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate = learning_rate * 0.5
            # For each training example...
            for i in range(len(self.y_train)):
                # One SGD step
                self.model.sgd_step(self.X_train[i], self.y_train[i], learning_rate)
                num_examples_seen += 1
        return self.model

    def calculate_score(self, text):
        # Score a single text as the sum of the negative log10 probabilities
        # the model assigns to its tokens
        texts = [text]
        xy = self.preprocess_text(texts)
        X_train = xy['x']
        y_train = xy['y']
        o = self.model.forward_propagation(X_train[0])
        p = 0
        i = -1
        for w in X_train[0]:
            i += 1
            p += -1 * np.log10(o[i][w])
        return p

    def preprocess_text(self, texts, vocabulary_size=_VOCABULARY_SIZE):
        unknown_token = "UNKNOWN_TOKEN"
        sentence_start_token = "SENTENCE_START"
        sentence_end_token = "SENTENCE_END"
        # Split full comments into sentences
        # sentences = itertools.chain(*[nltk.sent_tokenize(x.decode('utf-8').lower()) for x in texts])
        sentences = itertools.chain(*[nltk.sent_tokenize(x.lower()) for x in texts])
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
        # Tokenize the sentences into words
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        # Get the most common words and build index_to_word and word_to_index vectors
        if vocabulary_size == -1:
            # Keep every word; most_common() with no limit returns all (word, count) pairs
            vocab = word_freq.most_common()
        else:
            vocab = word_freq.most_common(vocabulary_size - 1)
        index_to_word = [x[0] for x in vocab]
        index_to_word.append(unknown_token)
        word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
        # Create the training data
        X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])
        return {'x': X_train, 'y': y_train, 'index_to_word': index_to_word, 'word_to_index': word_to_index}
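# A minimal usage sketch for LM_With_RNN above, assuming the module-level
# constants (_VOCABULARY_SIZE, _HIDDEN_DIM, _NEPOCH, _LEARNING_RATE) and
# RNNTheano are available as in the rest of this code. The sample strings
# are illustrative only.
texts = [
    "the cat sat on the mat.",
    "the dog sat on the log.",
    "a cat and a dog sat together.",
]
lm = LM_With_RNN(texts)  # preprocesses the texts and trains on construction
score = lm.calculate_score("the cat sat on the log.")
print "negative log10 probability: %f" % score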