import itertools

import nltk
import numpy as np

# RNNTheano and the module-level hyperparameters _VOCABULARY_SIZE, _HIDDEN_DIM,
# _NEPOCH and _LEARNING_RATE are assumed to be provided elsewhere in the project
# (e.g. from rnn_theano import RNNTheano).


class LM_With_RNN:

    def __init__(self, texts):
        # Create the training data and train the model immediately
        xy = self.preprocess_text(texts)
        self.X_train = xy['x']
        self.y_train = xy['y']
        self.model = RNNTheano(_VOCABULARY_SIZE, hidden_dim=_HIDDEN_DIM)
        self.train_with_sgd()

    def train_with_sgd(self, nepoch=_NEPOCH, evaluate_loss_after=5, learning_rate=_LEARNING_RATE):
        # We keep track of the losses so we can plot them later
        losses = []
        num_examples_seen = 0
        for epoch in range(nepoch):
            # Optionally evaluate the loss
            if epoch % evaluate_loss_after == 0:
                loss = self.model.calculate_loss(self.X_train, self.y_train)
                losses.append((num_examples_seen, loss))
                # Halve the learning rate if the loss increased
                if len(losses) > 1 and losses[-1][1] > losses[-2][1]:
                    learning_rate = learning_rate * 0.5
            # For each training example...
            for i in range(len(self.y_train)):
                # One SGD step
                self.model.sgd_step(self.X_train[i], self.y_train[i], learning_rate)
                num_examples_seen += 1
        return self.model

    def calculate_score(self, text):
        # Score a single text: sum of negative log10 probabilities of its tokens,
        # so lower values mean the model considers the text more likely.
        texts = [text]
        xy = self.preprocess_text(texts)
        X_train = xy['x']
        y_train = xy['y']
        o = self.model.forward_propagation(X_train[0])
        p = 0
        for i, w in enumerate(X_train[0]):
            p += -np.log10(o[i][w])
        return p

    def preprocess_text(self, texts, vocabulary_size=_VOCABULARY_SIZE):
        unknown_token = "UNKNOWN_TOKEN"
        sentence_start_token = "SENTENCE_START"
        sentence_end_token = "SENTENCE_END"

        # Split full comments into sentences
        # sentences = itertools.chain(*[nltk.sent_tokenize(x.decode('utf-8').lower()) for x in texts])
        sentences = itertools.chain(*[nltk.sent_tokenize(x.lower()) for x in texts])
        # Append SENTENCE_START and SENTENCE_END
        sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
        # Tokenize the sentences into words
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))

        # Get the most common words and build index_to_word and word_to_index vectors
        if vocabulary_size == -1:
            # Keep every word, ordered by frequency (most_common() returns
            # (word, count) pairs; elements() would only yield raw tokens)
            vocab = word_freq.most_common()
        else:
            vocab = word_freq.most_common(vocabulary_size - 1)
        index_to_word = [x[0] for x in vocab]
        index_to_word.append(unknown_token)
        word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])

        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]

        # Create the training data: x is each sentence without its last token,
        # y is the same sentence shifted left by one (the next-word targets)
        X_train = np.asarray([[word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

        return {
            'x': X_train,
            'y': y_train,
            'index_to_word': index_to_word,
            'word_to_index': word_to_index,
        }
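
# --- Usage sketch (added illustration, not part of the original module) ---
# Shows how LM_With_RNN might be driven end to end: build the model from a
# small corpus of raw strings, then score an unseen sentence. It assumes the
# hyperparameters and RNNTheano described above are importable; the corpus and
# the test sentence below are placeholders.
def demo_lm_with_rnn():
    corpus = [
        "The cat sat on the mat. The dog chased the cat.",
        "A dog sat on the log.",
    ]
    lm = LM_With_RNN(corpus)  # preprocesses the corpus and trains in __init__
    # calculate_score returns the summed negative log10 probability of the
    # sentence's tokens, so lower values mean the model finds it more likely.
    score = lm.calculate_score("The cat sat on the log.")
    print "Score (negative log10 probability): %f" % score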
# --- Training / inspection script (fragment) ---
y_train = np.asarray([[word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

model = RNNTheano(vocabulary_size, hidden_dim=_HIDDEN_DIM)
# t1 = time.time()
# model.sgd_step(X_train[10], y_train[10], _LEARNING_RATE)
# t2 = time.time()
# print "SGD Step time: %f milliseconds" % ((t2 - t1) * 1000.)

if _MODEL_FILE is not None:
    load_model_parameters_theano(_MODEL_FILE, model)
# train_with_sgd(model, X_train, y_train, nepoch=_NEPOCH, learning_rate=_LEARNING_RATE)

o = model.forward_propagation(X_train[1])
print o
print [index_to_word[x] for x in X_train[1]]

# Probability of the whole sentence: product of the per-step word probabilities
p = 1
for i, w in enumerate(X_train[1]):
    p *= o[i][w]
print p

# def generate_sentence(model):
#     # We start the sentence with the start token
#     new_sentence = [word_to_index[sentence_start_token]]
#     # Repeat until we get an end token
#     while not new_sentence[-1] == word_to_index[sentence_end_token]:
#         next_word_probs = model.forward_propagation(new_sentence)
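
# --- Added illustration (not in the original script) ---
# The product p above is the model's probability for the whole token sequence.
# A common way to report this on a length-independent scale is per-word
# perplexity, i.e. 2 to the power of the cross-entropy in bits per word.
def sentence_perplexity(o, sentence):
    # o[i][w] is the model's probability of token w at position i
    log2_prob = sum(np.log2(o[i][w]) for i, w in enumerate(sentence))
    return 2.0 ** (-log2_prob / len(sentence))

# Example, reusing the o and X_train computed above:
# print "Perplexity: %f" % sentence_perplexity(o, X_train[1])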
class RNNLM:

    def __init__(self):
        self.unknown_token = "UNKNOWN_TOKEN"
        self.sentence_start_token = "SENTENCE_START"
        self.sentence_end_token = "SENTENCE_END"
        self.index_to_word = None
        self.word_to_index = None
        self.model = None

    def tokenize_data(self, n=-1):
        # Download the required nltk resources if you haven't already:
        # nltk.download('punkt')
        # Read the data and wrap each sentence with SENTENCE_START and SENTENCE_END tokens
        print "Reading sentences from gutenberg corpus ..."
        from nltk.corpus import gutenberg
        tokenized_sentences = []
        for s in gutenberg.sents('austen-emma.txt'):
            tokenized_sentences.append([self.sentence_start_token] + s[1:-1] + [self.sentence_end_token])
        print "Parsed %d sentences." % (len(tokenized_sentences))
        if n > 0:
            tokenized_sentences = tokenized_sentences[:n]

        # Count the word frequencies
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print "Found %d unique word tokens." % len(word_freq.items())
        self.vocabulary_size = int(len(word_freq.items()) * 0.95)

        # Keep the most common words; treat all other words as unknown.
        vocab = word_freq.most_common(self.vocabulary_size - 1)
        print "Using vocabulary size %d." % self.vocabulary_size
        print "The least frequent word is '%s' and appeared %d times." % \
            (vocab[-1][0], vocab[-1][1])

        self.index_to_word = [x[0] for x in vocab]
        self.index_to_word.append(self.unknown_token)
        self.word_to_index = dict([(w, i) for i, w in enumerate(self.index_to_word)])

        # Replace all words not in our vocabulary with the unknown token
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [
                w if w in self.word_to_index else self.unknown_token for w in sent
            ]

        # Create the training data: x is each sentence without its last token,
        # y is the same sentence shifted left by one (the next-word targets)
        x_train = np.asarray([[self.word_to_index[w] for w in sent[:-1]] for sent in tokenized_sentences])
        y_train = np.asarray([[self.word_to_index[w] for w in sent[1:]] for sent in tokenized_sentences])

        print ""
        print "Example sentence: '%s'" % tokenized_sentences[0]
        print "By word indexes: '%s'" % \
            [self.word_to_index[w] for w in tokenized_sentences[0]]
        return (x_train, y_train)

    def train_numpy(self, x_train, y_train, iterations):
        self.model = RNNNumpy(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_theano(self, x_train, y_train, iterations):
        self.model = RNNTheano(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def train_lstm_theano(self, x_train, y_train, iterations):
        # NOTE: this currently builds the same RNNTheano model as train_theano;
        # swap in an LSTM implementation here if one is available.
        self.model = RNNTheano(word_dim=self.vocabulary_size, hidden_dim=100, bptt_truncate=4)
        self.model.sgd(x_train, y_train, 0.01, iterations)

    def generate_sentence(self):
        sentence_start_idx = self.word_to_index[self.sentence_start_token]
        sentence_end_idx = self.word_to_index[self.sentence_end_token]
        unknown_word_idx = self.word_to_index[self.unknown_token]
        # Start the sentence with the start token and repeat until we get an end token
        new_sentence = [sentence_start_idx]
        while new_sentence[-1] != sentence_end_idx:
            next_word_probs = self.model.forward_propagation(new_sentence)
            sampled_word = unknown_word_idx
            # Skip unknown words and the start token
            while sampled_word == unknown_word_idx or \
                    sampled_word == sentence_start_idx:
                # Sample the next word from the distribution at the last time step
                samples = np.random.multinomial(1, next_word_probs[-1])
                sampled_word = np.argmax(samples)
            new_sentence.append(sampled_word)
        return new_sentence

    def generate_sentences(self, num_sentences, min_length):
        for i in xrange(num_sentences):
            sent = []
            # We want long sentences, not sentences with one or two words
            while len(sent) < min_length:
                sent = self.generate_sentence()
            sent_str = [self.index_to_word[x] for x in sent[1:-1]]
            print " ".join(sent_str).encode('utf-8')
            print ""
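
# --- Usage sketch (added illustration, not part of the original class) ---
# Shows the intended flow for RNNLM: tokenize the Austen corpus, train the
# Theano RNN for a few SGD iterations, and print some generated sentences.
# The sentence limit, iteration count and lengths below are placeholders.
def demo_rnnlm():
    lm = RNNLM()
    x_train, y_train = lm.tokenize_data(n=1000)   # limit to 1000 sentences for a quick run
    lm.train_theano(x_train, y_train, iterations=10)
    lm.generate_sentences(num_sentences=5, min_length=7)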