import unittest

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

import mg
import util

# How often (in epochs) the training loops below log accuracy. The original code
# uses log_every without defining it in this snippet, so this value is an assumption.
log_every = 10
def plot_accuracy_vs_data():
    """Train on increasing fractions of the data and plot the resulting accuracies."""
    epoch = log_every
    n_features = 20
    # (sparse_tfidf_texts, targets) = util.do_tf_idf(n_features)
    (sparse_tfidf_texts, targets) = util.do_term_cat()
    output_size = len(util.subjects)
    (x, out, y, dropout_keep_prob) = util.neural_net(n_features, output_size)

    n = 16
    total = []
    test = []
    train = []
    for i in range(1, n):
        print("i", i)
        pc_of_data = float(i) / float(n)
        # Sample a random subset of the documents proportional to the current fraction.
        indices = np.random.choice(sparse_tfidf_texts.shape[0],
                                   round(pc_of_data * sparse_tfidf_texts.shape[0]),
                                   replace=False)
        print(indices)
        doc_vec_sample = sparse_tfidf_texts[indices]
        target_sample = np.array(targets)[indices]
        (test_accs, train_accs, total_accs) = train_and_test_in_batches(
            x, out, y, doc_vec_sample, target_sample, epoch, dropout_keep_prob)
        print(test_accs, train_accs, total_accs)
        total.extend(total_accs)
        test.extend(test_accs)
        train.extend(train_accs)

    xs = [float(i) / n for i in range(len(total))]
    print("xs", xs)
    print("total", total)
    plt.plot(xs, total)
    plt.plot(xs, test, 'o', c='b')
    plt.plot(xs, train, 'o', c='r')
    plt.ylabel('accuracy')
    plt.xlabel('fraction of all data')
    plt.show()
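
# A minimal sketch of the kind of builder util.neural_net is assumed to be: a single
# hidden layer with a dropout keep-probability placeholder, returning the input
# placeholder, the logits, the label placeholder and the keep-prob placeholder.
# The hidden size and the details below are assumptions, not the actual util code.
def neural_net_sketch(n_features, output_size, hidden=64):
    x = tf.placeholder(tf.float32, shape=[None, n_features], name="x")
    y = tf.placeholder(tf.float32, shape=[None, output_size], name="y")
    dropout_keep_prob = tf.placeholder_with_default(1.0, shape=(), name="keep_prob")

    w1 = tf.Variable(tf.truncated_normal([n_features, hidden], stddev=0.1))
    b1 = tf.Variable(tf.zeros([hidden]))
    hidden_act = tf.nn.relu(tf.matmul(x, w1) + b1)
    hidden_act = tf.nn.dropout(hidden_act, keep_prob=dropout_keep_prob)

    w2 = tf.Variable(tf.truncated_normal([hidden, output_size], stddev=0.1))
    b2 = tf.Variable(tf.zeros([output_size]))
    out = tf.matmul(hidden_act, w2) + b2
    return x, out, y, dropout_keep_prob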
def train_and_test(nn_init_fn, epoch):
    """Build a network with nn_init_fn and train/test it on the full document set."""
    n_features = len(util.subjects)
    # (sparse_tfidf_texts, targets) = util.do_tf_idf(n_features)
    (sparse_tfidf_texts, targets) = util.do_term_cat()
    output_size = len(util.subjects)
    (x, out, y) = nn_init_fn(n_features, output_size)
    return train_and_test_in_batches(x, out, y, sparse_tfidf_texts, targets, epoch)
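
# A hedged usage sketch, not part of the original code: train_and_test expects a
# builder that returns only (x, out, y), so util.neural_net, which also returns a
# dropout placeholder (as used in plot_accuracy_vs_data above), can be adapted by
# dropping that fourth value. The epoch budget in the example call is arbitrary.
def net_without_dropout(n_features, output_size):
    x, out, y, _keep_prob = util.neural_net(n_features, output_size)
    return x, out, y

# Example invocation:
#   train_and_test(net_without_dropout, epoch=10 * log_every)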
class MyTestCase(unittest.TestCase):
    docs = ['why hello there', 'omg hello pony', 'she went there? omg']
    words = [w for doc in docs for w in doc.split(' ')]
    cleaned_words = set(map(lambda x: mg.cleaned(x), words))
    print(words)
    targets = [0, 1, 0]
    categories = set(targets)
    n_categories = len(categories)

    def test_stopword_removal(self):
        lines = mg.remove_stopwords(self.docs)
        self.assertEqual(len(self.docs), len(lines))
        print("lines", lines)
        self.assertTrue("hello" in lines)

    def test_word_to_cat_vector(self):
        word2vec, n_features = mg.word_to_cat_vector(self.docs, self.targets)
        for word in mg.cleaned_docs(self.words):
            v = word2vec[word]
            self.assertEqual(len(v), self.n_categories)

    def test_to_csr(self):
        max_vec_size = mg.max_words(self.docs) * len(self.docs)
        doc_vectors, n_features = mg.to_csr(self.docs, self.targets, max_vec_size)
        self.assertEqual(doc_vectors.shape[0], len(self.docs))
        self.assertEqual(doc_vectors.shape[1], max_vec_size)
        self.assertEqual(n_features, len(set(mg.cleaned_docs(self.words))))

    def test_docs_to_vecs(self):
        max_vec_size = mg.max_words(self.docs) * len(self.docs)
        vecs, n_features = mg.docs_to_vecs(self.docs, self.targets, max_vec_size)
        self.assertEqual(len(vecs), len(self.docs))
        for v in vecs:
            self.assertEqual(len(v), max_vec_size)

    def test_vec_per_category(self):
        cat_to_vec, vec, X = mg.vec_per_category(self.docs, self.targets)
        print(cat_to_vec)
        self.assertEqual(len(cat_to_vec), self.n_categories)
        for i in cat_to_vec:
            row = cat_to_vec[i]
            print("i", i, "row", row)
            self.assertEqual(len(row), len(vec.get_feature_names()))
            self.assertEqual(len(row), len(set(mg.cleaned_docs(self.words))))

    def test_cleaned(self):
        self.assertEqual(mg.cleaned("she went there? omg"), "she went there omg")

    def test_pad_list(self):
        self.assertEqual(len(mg.pad_with_zeros_or_truncate([1] * 3, 5)), 5)

    def test_pad_empty(self):
        xs = []
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(len(truncated), 3)

    def test_truncate_list(self):
        xs = [1] * 10
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(len(truncated), 3)

    def test_truncate_first(self):
        xs = [1, 2, 3, 4, 5]
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(truncated, [1, 2, 3])

    def test_leave_list_the_right_size_alone(self):
        xs = [1] * 5
        self.assertEqual(mg.pad_with_zeros_or_truncate(xs, 5), xs)

    def test_max_words(self):
        self.assertEqual(mg.max_words(self.docs), 5)  # grammar counts

    def test_categories(self):
        agg = mg.as_categories(self.docs, self.targets)
        self.assertEqual(len(agg), self.n_categories)

    def test_line_per_category(self):
        cats = mg.line_per_category(self.docs, self.targets)
        print(cats)
        self.assertEqual(len(cats), self.n_categories)

    def test_documents_to_vectors(self):
        n_cat = len(set(self.targets))
        max_w = mg.max_words(self.docs)
        vectors = mg.as_vectors_from_dtm(self.docs, self.targets, n_cat * max_w)
        self.assertEqual(len(vectors), len(self.docs))
        for vec in vectors:
            self.assertEqual(len(vec), n_cat * max_w)
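
# The padding tests above pin down the contract of mg.pad_with_zeros_or_truncate:
# pad short lists with zeros up to the requested length, truncate long ones keeping
# the first elements, and leave correctly-sized lists alone. A minimal sketch of
# such a helper (an assumption for illustration, not the actual mg implementation):
def pad_with_zeros_or_truncate_sketch(xs, size):
    return (xs + [0] * (size - len(xs)))[:size]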
def train_and_test_in_batches(x, out, y, sparse_tfidf_texts, targets, epoch,
                              dropout_keep_prob=None):
    """Train in mini-batches for `epoch` epochs, periodically logging accuracy.

    dropout_keep_prob is the optional keep-probability placeholder returned by
    util.neural_net; it is only fed when supplied.
    """
    (optimizer, loss) = util.optimiser_loss(out, y)
    accuracy = util.accuracy_fn(out, y)
    all_test = range(sparse_tfidf_texts.shape[0])
    test_accs = []
    train_accs = []
    total_accs = []
    p_dropout = 0.9  # 1.0->88.8% accuracy. 0.8->68%, 0.95->81%, 0.85->71.9%, 0.9->76.5%, 0.97->83.3%, 0.99->84.7%, 1.0->85.3%
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(epoch):
            i_batch = 0
            total_batch_train_acc = 0.0
            total_batch_test_acc = 0.0
            total_batch_train_loss = 0.0
            total_batch_test_loss = 0.0
            testing_training = test_train_indices(len(targets), 128, 1.0)
            for (test_indices, train_indices) in testing_training:
                rand_index = train_indices
                rand_x = sparse_tfidf_texts[rand_index]  # .todense()
                rand_y = util.one_hot(rand_index, out.shape[1], targets)
                f_dict = {x: rand_x, y: rand_y}
                if dropout_keep_prob is not None:
                    f_dict[dropout_keep_prob] = p_dropout
                train_loss, _, train_acc = sess.run([loss, optimizer, accuracy],
                                                    feed_dict=f_dict)
                if i % log_every == log_every - 1:
                    f_dict_test = {
                        x: sparse_tfidf_texts[test_indices],  # .todense()
                        y: util.one_hot(test_indices, out.shape[1], targets)
                    }
                    if dropout_keep_prob is not None:
                        f_dict_test[dropout_keep_prob] = 1.0  # no dropout at test time
                    test_acc = sess.run(accuracy, feed_dict=f_dict_test)
                    test_loss = sess.run(loss, feed_dict=f_dict_test)
                    i_batch = i_batch + 1
                    total_batch_test_acc += test_acc
                    total_batch_train_acc += train_acc
                    total_batch_train_loss += train_loss
                    total_batch_test_loss += test_loss
            if i % log_every == log_every - 1:
                print("\nEpoch %d " % i)
                test_acc = (total_batch_test_acc / i_batch)
                train_acc = (total_batch_train_acc / i_batch)
                print("Average test accuracy ", test_acc)
                print("Average train accuracy", train_acc)
                print("Average test loss     ", (total_batch_test_loss / i_batch))
                print("Average train loss    ", (total_batch_train_loss / i_batch))
                f_dict_all = {
                    x: sparse_tfidf_texts[all_test],  # .todense()
                    y: util.one_hot(all_test, out.shape[1], targets)
                }
                if dropout_keep_prob is not None:
                    f_dict_all[dropout_keep_prob] = 1.0
                acc = sess.run(accuracy, feed_dict=f_dict_all)
                print("batch accuracy ", acc)
                test_accs.append(test_acc)
                train_accs.append(train_acc)
                total_accs.append(acc)
    return test_accs, train_accs, total_accs
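
# Neither test_train_indices nor util.one_hot is defined in this snippet. The sketches
# below are assumptions inferred from how they are called above: test_train_indices is
# taken to shuffle a fraction of the n indices, hold some out for testing and yield a
# (test_indices, train_batch) pair per mini-batch; one_hot builds a one-hot label
# matrix for the selected rows. The 90/10 split is an arbitrary illustrative choice.
def test_train_indices_sketch(n, batch_size, fraction, test_share=0.1):
    indices = np.random.permutation(int(round(fraction * n)))
    n_test = max(1, int(round(test_share * len(indices))))
    test_indices, train_indices = indices[:n_test], indices[n_test:]
    for start in range(0, len(train_indices), batch_size):
        yield test_indices, train_indices[start:start + batch_size]


def one_hot_sketch(indices, n_classes, targets):
    labels = np.zeros((len(indices), int(n_classes)))
    labels[np.arange(len(indices)), [targets[i] for i in indices]] = 1.0
    return labels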
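
# Standard unittest entry point (not in the original snippet) so the tests above can
# be run directly with `python <this file>`.
if __name__ == '__main__':
    unittest.main()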