def test_session(self, session, inds):
    # For debugging purposes only.
    # Take a random sample of the training set:
    # inds = np.random.choice(self.training_set, math.floor(0.01 * len(self.training_set)), replace=False)
    small_s = generate_padded_seq(self.config.max_length, self.config.output_size, inds)
    small_s_y = [i[1:] for i in small_s]
    seq_len = [len(i) for i in inds]
    masks = get_masks(inds, self.config.max_length)
    feed = self.create_feed_dict(inputs_batch=small_s,
                                 labels_batch=small_s_y,
                                 dropout=self.config.drop_out,
                                 mask_batch=masks,
                                 seq_length=seq_len)
    loss = session.run([self.loss], feed_dict=feed)
    return loss
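
# Hypothetical usage sketch for the debugging helper above. The commented-out
# np.random.choice line suggests test_session is meant to run on a small random
# subsample of the training sequences; `model`, `sess`, and `train_seqs` below
# are illustrative names, not part of this module.
def _debug_loss_sketch(model, sess, train_seqs):
    # Sample roughly 1% of the training sequences and report the loss on them.
    idx = np.random.choice(len(train_seqs), max(1, int(0.01 * len(train_seqs))), replace=False)
    return model.test_session(sess, train_seqs[idx])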
def train_on_batch(self, sess, batch):
    batch_x = generate_padded_seq(self.config.max_length, self.config.output_size, batch)
    batch_y = [i[1:] for i in batch_x]
    seq_len = [len(i) for i in batch]
    masks = get_masks(batch, self.config.max_length)
    feed = self.create_feed_dict(inputs_batch=batch_x,
                                 labels_batch=batch_y,
                                 dropout=self.config.drop_out,
                                 mask_batch=masks,
                                 seq_length=seq_len)
    _, loss = sess.run([self.train_op, self.loss], feed_dict=feed)
    # loss = sess.run(self.error, feed_dict=feed)
    # pred = sess.run(self.probs, feed_dict=feed)
    # return pred
    return _, loss
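
# generate_padded_seq and get_masks are defined elsewhere in the repo. The
# sketches below (note the _sketch suffix) only illustrate the behavior this
# file appears to assume: pad every sequence to a fixed length and mark which
# positions hold real tokens. The pad id (the last vocabulary index) is a
# guess, not confirmed by the source.
def _generate_padded_seq_sketch(max_length, vocab_size, batch):
    pad_idx = vocab_size - 1  # assumed padding id: the last embedding row
    return [list(seq) + [pad_idx] * (max_length - len(seq)) for seq in batch]

def _get_masks_sketch(batch, max_length):
    # 1 for real tokens, 0 for padded positions.
    return [[1] * len(seq) + [0] * (max_length - len(seq)) for seq in batch]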
def train(args):
    n_epochs = 30
    embeddings = get_embeddings(embed_path='./data/new_embeddings_final_filtered.pkl')
    # embeddings = np.load('./data/final_large_weights.npy')
    # embeddings = np.vstack([embeddings, np.zeros(embeddings.shape[1])])
    all_dat = collections.defaultdict(list)
    raw_data = get_data(path='./data/2015_data_tokenzed.pkl')
    for r, post in raw_data:
        all_dat[r].append(post)

    # vocabs = collections.defaultdict(str)
    # with open('./data/large_vocab') as csvfile:
    #     vocab = csv.reader(csvfile)
    #     for v in vocab:
    #         vocabs[v[1]] = v[0]

    # Load the vocabulary:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    r = args.subreddit
    sample = np.array([get_indices(j) for j in all_dat[r]])
    # subsample_y = [get_indices(j) for j[1:] in all_dat['personalfinance']][0:100]

    # seq_length, max_length, embed_size, output_size
    max_length = max([len(se) for se in sample])
    config_file = Config(max_length=max_length,
                         embed_size=embeddings.shape[1],
                         output_size=embeddings.shape[0],
                         batch_size=128,
                         drop_out=args.dropout,
                         learning_rate=args.learningrate,
                         hidden_unit_size=args.hiddensize)

    idx = np.arange(len(sample))
    train_inds, dev_inds, test_inds = get_dev_test_sets(
        dev_size=config_file.dev_set_size,
        test_size=config_file.test_set_size,
        training_indices=idx)
    train, dev, test = sample[train_inds], sample[dev_inds], sample[test_inds]

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=config_file)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            # loss = m.test_session(sess, train)
            best_perplexity = np.inf
            for epoch in range(n_epochs):
                print "Epoch: " + str(epoch + 1)
                m.run_epoch(sess, np.array(train))

                # Evaluate perplexity on the dev set.
                test_size = len(dev)
                total_perplexity = 0
                total_batches = 0
                for k, indices in enumerate(get_batch(test_size, 100)):
                    total_batches += 1
                    test_batch = dev[indices]
                    masks = get_masks(test_batch, config_file.max_length)
                    seq_len = [len(i) for i in test_batch]
                    batch_x = generate_padded_seq(config_file.max_length, config_file.output_size, test_batch)
                    batch_y = [i[1:] for i in batch_x]
                    feed = m.create_feed_dict(inputs_batch=batch_x,
                                              labels_batch=batch_y,
                                              dropout=config_file.drop_out,
                                              mask_batch=masks,
                                              seq_length=seq_len)
                    perplexities = sess.run(m.error, feed_dict=feed)
                    total_perplexity += perplexities
                    # seq_inds = np.arange(len(seq_len))
                    # print "Average Perplexity Across Entire Set: " + str(sum([np.prod(perplexities[i][0:seq_len[i]])**(-1/seq_len[i]) for i in seq_inds])/len(seq_inds))
                    print "Epoch: " + str(epoch + 1) + " average dev perplexity for batch " + str(k + 1) + ': ' + str(perplexities)

                if (total_perplexity / total_batches) < best_perplexity:
                    best_perplexity = total_perplexity / total_batches
                    print "New Best Perplexity: " + str(best_perplexity)
                    saver.save(sess, "./code/trainer/models/" + r + "/epoch_" + str(epoch + 1) + ".ckpt")

    with open('./code/trainer/diag/diagnostics.csv', 'a') as diag_out:
        csv_diag_out = csv.writer(diag_out)
        csv_diag_out.writerow([
            args.subreddit,
            str(best_perplexity),
            str(config_file.hidden_unit_size),
            str(config_file.learning_rate),
            str(config_file.embed_size)
        ])
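
# get_dev_test_sets and get_batch are also defined elsewhere in the repo; the
# sketches below show one plausible implementation consistent with how they are
# called above (a shuffled index split, and an iterator over chunks of dev-set
# indices). They are assumptions, not the repo's actual helpers.
def _get_dev_test_sets_sketch(dev_size, test_size, training_indices):
    shuffled = np.random.permutation(training_indices)
    dev = shuffled[:dev_size]
    test = shuffled[dev_size:dev_size + test_size]
    train = shuffled[dev_size + test_size:]
    return train, dev, test

def _get_batch_sketch(set_size, batch_size):
    # Yield successive chunks of indices covering range(set_size).
    for start in range(0, set_size, batch_size):
        yield np.arange(start, min(start + batch_size, set_size))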
    feed = self.create_feed_dict(inputs_batch=batch_x,
                                 labels_batch=batch_y,
                                 dropout=self.config.drop_out,
                                 mask_batch=masks,
                                 seq_length=seq_len)
    _, loss = sess.run([self.train_op, self.loss], feed_dict=feed)
    # loss = sess.run(self.error, feed_dict=feed)
    # pred = sess.run(self.probs, feed_dict=feed)
    # return pred
    return _, loss

def train_on_batch_single(self, sess, batch):
    total_loss = 0.
    max_len = max(len(case) for case in batch)
    padded = generate_padded_seq(max_len, self.config.output_size, batch)
    masks = np.matrix(get_masks(batch, max_len))
    batch_x = [i[:-1] for i in padded]
    batch_y = [i[1:] for i in padded]
    # Make the batches into matrices so the columns (time steps) are easy to feed.
    batch_x_mat = np.matrix(batch_x)
    batch_y_mat = np.matrix(batch_y)
    assert batch_x_mat.shape[1] == batch_y_mat.shape[1], (
        "x and y are not the same length. x: " + str(batch_x_mat.shape[1]) +
        ". y: " + str(batch_y_mat.shape[1]))
    for i in range(batch_x_mat.shape[1]):
        x = batch_x_mat[:, i]
        y = batch_y_mat[:, i]
        m = masks[:, i]
        # Feed one time step at a time, mirroring the per-step evaluation loop in train().
        feed = self.create_feed_dict(inputs_batch=x,
                                     labels_batch=y,
                                     dropout=self.config.drop_out,
                                     mask_batch=m,
                                     seq_length=[1] * len(batch))
        _, loss = sess.run([self.train_op, self.loss], feed_dict=feed)
        total_loss += loss
    return total_loss
def train(args):
    n_epochs = 100
    embeddings = get_embeddings(embed_path='./data/new_embeddings_final_filtered.pkl')
    # embeddings = np.load('./data/final_large_weights.npy')
    # embeddings = np.vstack([embeddings, np.zeros(embeddings.shape[1])])
    all_dat = collections.defaultdict(list)
    raw_data = get_data(path='./data/2015_data_tokenzed.pkl')
    for r, post in raw_data:
        all_dat[r].append(post)

    # vocabs = collections.defaultdict(str)
    # with open('./data/large_vocab') as csvfile:
    #     vocab = csv.reader(csvfile)
    #     for v in vocab:
    #         vocabs[v[1]] = v[0]

    # Load the vocabulary:
    with open('./data/large_vocab_final_filtered.pkl', 'rb') as f:
        vocabs = cPickle.load(f)
    vocabs = collections.defaultdict(str, vocabs)

    def get_indices(sent):
        return [vocabs[i] for i in sent]

    vocabs_reversed = {v: k for k, v in vocabs.iteritems()}

    def get_words(sent):
        return [vocabs_reversed[i] for i in sent]

    r = args.subreddit
    sample = np.array([get_indices(j) for j in all_dat[r]])
    # subsample_y = [get_indices(j) for j[1:] in all_dat['personalfinance']][0:100]

    max_length = max(len(i) for i in sample)
    # seq_length, max_length, embed_size, output_size
    config_file = Config(drop_out=args.dropout,
                         max_length=max_length,
                         embed_size=embeddings.shape[1],
                         output_size=embeddings.shape[0],
                         batch_size=256,
                         learning_rate=args.learningrate,
                         hidden_unit_size=args.hiddensize,
                         num_layers=args.numlayers,
                         sequence_length=args.seqlength,
                         peepholes=args.peephole)

    idx = np.arange(len(sample))
    train_inds, dev_inds, test_inds = get_dev_test_sets(
        dev_size=config_file.dev_set_size,
        test_size=config_file.test_set_size,
        training_indices=idx)
    train, dev, test = sample[train_inds], sample[dev_inds], sample[test_inds]

    with tf.Graph().as_default():
        m = RNN_LSTM(embeddings=embeddings, config=config_file)
        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as sess:
            sess.run(init)
            # loss = m.test_session(sess, train)
            best_perplexity = np.inf
            for epoch in range(n_epochs):
                print "Epoch: " + str(epoch + 1)
                m.run_epoch(sess, np.array(train))

                # Evaluate perplexity on the dev set.
                test_size = len(dev)
                total_perplexity = 0
                total_batches = 0
                for k, indices in enumerate(get_batch(test_size, 100)):
                    total_batches += 1
                    test_batch = dev[indices]
                    max_len = max(len(case) for case in test_batch)
                    padded = generate_padded_seq(max_len, config_file.output_size, test_batch)
                    masks = np.matrix(get_masks(test_batch, max_len))
                    batch_x = [i[:-1] for i in padded]
                    batch_y = [i[1:] for i in padded]
                    batch_x_mat = np.matrix(batch_x)
                    batch_y_mat = np.matrix(batch_y)
                    # batch_perplexity = 0
                    batch_loss = 0.
                    sequences = get_sequence(max_len, sequence_length=config_file.sequence_length)
                    for bat in sequences:
                        x = batch_x_mat[:, bat]
                        y = batch_y_mat[:, bat]
                        batch_mask = masks[:, bat]
                        feed = m.create_feed_dict(inputs_batch=x,
                                                  labels_batch=y,
                                                  dropout=config_file.drop_out,
                                                  mask_batch=batch_mask,
                                                  seq_length=[1] * len(test_batch))
                        loss = sess.run(m.loss, feed_dict=feed)
                        # perplexities = sess.run(m.error, feed_dict=feed)
                        # print "Single word-pair perplexity: " + str(perplexities)
                        batch_loss += loss

                    batch_loss = batch_loss / len(sequences)
                    batch_perplexity = 2 ** batch_loss  # perplexity from the average per-step loss
                    total_perplexity += batch_perplexity
                    print "Epoch " + str(epoch + 1) + " dev perplexity for batch " + str(k + 1) + ': ' + str(batch_perplexity)

                if total_perplexity < best_perplexity:
                    best_perplexity = total_perplexity
                    print "New Best Perplexity: " + str(best_perplexity)
                    saver.save(sess, "./code/trainer/models/" + r.lower() + "/single_epoch_" + str(epoch + 1) + ".ckpt")

    with open('./code/trainer/diag/diagnostics_new_final.csv', 'a') as diag_out:
        csv_diag_out = csv.writer(diag_out)
        csv_diag_out.writerow([
            args.subreddit,
            str(config_file.peephole),
            str(best_perplexity),
            str(config_file.drop_out),
            str(config_file.hidden_unit_size),
            str(config_file.learning_rate),
            str(config_file.embed_size),
            str(config_file.sequence_length)
        ])
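
# The command-line flags consumed above (args.subreddit, args.dropout,
# args.learningrate, args.hiddensize, args.numlayers, args.seqlength,
# args.peephole) are parsed elsewhere in the repo; the sketch below shows one
# plausible argparse setup matching those attribute names. The types and
# defaults are assumptions, and this function is illustrative rather than the
# repo's actual entry point.
def _cli_sketch():
    import argparse
    parser = argparse.ArgumentParser(description='Train the per-subreddit LSTM language model.')
    parser.add_argument('--subreddit', required=True)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--learningrate', type=float, default=0.001)
    parser.add_argument('--hiddensize', type=int, default=256)
    parser.add_argument('--numlayers', type=int, default=1)
    parser.add_argument('--seqlength', type=int, default=1)
    parser.add_argument('--peephole', action='store_true')
    train(parser.parse_args())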