Example 1
    def test_documents_to_vectors(self):
        n_cat = len(set(self.targets))
        max_w = mg.max_words(self.docs)
        vectors = mg.as_vectors_from_dtm(self.docs, self.targets, n_cat * max_w)
        self.assertEqual(len(vectors), len(self.docs))
        for vec in vectors:
            self.assertEqual(len(vec), n_cat * max_w)
Example 2
    def test_vec_per_category(self):
        cat_to_vec, vec, X = mg.vec_per_category(self.docs, self.targets)
        print(cat_to_vec)
        self.assertEqual(len(cat_to_vec), self.n_categories)
        for i in cat_to_vec:
            row = cat_to_vec[i]
            print("i", i, "row", row)
            self.assertEqual(len(row), len(vec.get_feature_names()))
            self.assertEqual(len(row), len(set(mg.cleaned_docs(self.words))))
Example 3
import numpy as np
import matplotlib.pyplot as plt

# `util`, `log_every` and `train_and_test_in_batches` are defined elsewhere
# in the module this excerpt comes from.


def plot_accuracy_vs_data():
    epoch = log_every
    n_features = 20
    # (sparse_tfidf_texts, targets) = util.do_tf_idf(n_features)
    (sparse_tfidf_texts, targets) = util.do_term_cat()

    output_size = len(util.subjects)

    (x, out, y, dropout_keep_prob) = util.neural_net(n_features, output_size)

    n = 16
    total = []
    test = []
    train = []
    for i in range(1, n):
        print("i", i)
        pc_of_data = float(i) / float(n)
        indices = np.random.choice(sparse_tfidf_texts.shape[0],
                                   round(pc_of_data *
                                         sparse_tfidf_texts.shape[0]),
                                   replace=False)
        print(indices)
        doc_vec_sample = sparse_tfidf_texts[indices]
        target_sample = np.array(targets)[indices]
        (test_accs, train_accs,
         total_accs) = train_and_test_in_batches(x, out, y, doc_vec_sample,
                                                 target_sample,
                                                 epoch)  #, dropout_keep_prob)
        print(test_accs, train_accs, total_accs)
        total.extend(total_accs)
        test.extend(test_accs)
        train.extend(train_accs)

    xs = [float(i) / n for i in range(len(total))]
    print("xs", xs)
    print("total", total)
    plt.plot(xs, total)
    plt.plot(xs, test, 'o', c='b')
    plt.plot(xs, train, 'o', c='r')
    plt.ylabel('accuracy')
    plt.xlabel('fraction of all data')
    plt.show()
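
The interesting part of this example is the subsampling step: for each fraction
i/n it draws a random, non-repeating set of row indices with np.random.choice
and slices both the document matrix and the targets with them. A minimal
standalone sketch of the same idea, using a dense NumPy array in place of the
project's sparse matrix (the function and variable names here are illustrative,
not taken from the source):

import numpy as np


def sample_fraction(doc_matrix, targets, fraction, seed=None):
    """Return a random `fraction` of the rows, with their matching targets."""
    rng = np.random.default_rng(seed)
    n_rows = doc_matrix.shape[0]
    indices = rng.choice(n_rows, round(fraction * n_rows), replace=False)
    return doc_matrix[indices], np.array(targets)[indices]


# Take 25% of a toy 8x3 matrix.
docs = np.arange(24).reshape(8, 3)
labels = [0, 1, 0, 1, 0, 1, 0, 1]
sample_docs, sample_labels = sample_fraction(docs, labels, 0.25, seed=42)
print(sample_docs.shape)  # (2, 3)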
Example 4
def train_and_test(nn_init_fn, epoch):
    n_features = len(util.subjects)
    # (sparse_tfidf_texts, targets) = util.do_tf_idf(n_features)
    (sparse_tfidf_texts, targets) = util.do_term_cat()

    output_size = len(util.subjects)

    (x, out, y) = nn_init_fn(n_features, output_size)
    return train_and_test_in_batches(x, out, y, sparse_tfidf_texts, targets,
                                     epoch)  #, dropout_keep_prob)
Example 5
    def test_to_csr(self):
        max_vec_size = mg.max_words(self.docs) * len(self.docs)
        doc_vectors, n_features = mg.to_csr(self.docs, self.targets, max_vec_size)
        self.assertEqual(doc_vectors.shape[0], len(self.docs))
        self.assertEqual(doc_vectors.shape[1], max_vec_size)
        self.assertEqual(n_features, len(set(mg.cleaned_docs(self.words))))
Example 6
import unittest

# `mg` is the project module under test; it is assumed to be imported at the
# top of this test file.


class MyTestCase(unittest.TestCase):
    docs = ['why hello there', 'omg hello pony', 'she went there? omg']
    words = [w for doc in docs for w in doc.split(' ')]
    cleaned_words = set(map(lambda x: mg.cleaned(x), words))
    print(words)
    targets = [0, 1, 0]
    categories = set(targets)
    n_categories = len(categories)

    def test_stopword_removal(self):
        lines = mg.remove_stopwords(self.docs)
        self.assertEqual(len(self.docs), len(lines))
        print("lines", lines)
        self.assertTrue("hello" in lines)

    def test_word_to_cat_vector(self):
        word2vec, n_features = mg.word_to_cat_vector(self.docs, self.targets)
        for word in mg.cleaned_docs(self.words):
            v = word2vec[word]
            self.assertEqual(len(v), self.n_categories)

    def test_to_csr(self):
        max_vec_size = mg.max_words(self.docs) * len(self.docs)
        doc_vectors, n_features = mg.to_csr(self.docs, self.targets, max_vec_size)
        self.assertEqual(doc_vectors.shape[0], len(self.docs))
        self.assertEqual(doc_vectors.shape[1], max_vec_size)
        self.assertEqual(n_features, len(set(mg.cleaned_docs(self.words))))

    def test_docs_to_vecs(self):
        max_vec_size = mg.max_words(self.docs) * len(self.docs)
        vecs, n_features = mg.docs_to_vecs(self.docs, self.targets, max_vec_size)
        self.assertEqual(len(vecs), len(self.docs))
        for v in vecs:
            self.assertEqual(len(v), max_vec_size)

    def test_vec_per_category(self):
        cat_to_vec, vec, X = mg.vec_per_category(self.docs, self.targets)
        print(cat_to_vec)
        self.assertEqual(len(cat_to_vec), self.n_categories)
        for i in cat_to_vec:
            row = cat_to_vec[i]
            print("i", i, "row", row)
            self.assertEqual(len(row), len(vec.get_feature_names()))
            self.assertEqual(len(row), len(set(mg.cleaned_docs(self.words))))

    def test_cleaned(self):
        self.assertEqual(mg.cleaned("she went there? omg"), "she went there omg")

    def test_pad_list(self):
        self.assertEqual(len(mg.pad_with_zeros_or_truncate([1] * 3, 5)), 5)

    def test_pad_empty(self):
        xs = []
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(len(truncated), 3)

    def test_truncate_list(self):
        xs = [1] * 10
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(len(truncated), 3)

    def test_truncate_first(self):
        xs = [1, 2, 3, 4, 5]
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(truncated, [1, 2, 3])

    def test_leave_list_the_right_size_alone(self):
        xs = [1] * 5
        self.assertEqual(mg.pad_with_zeros_or_truncate(xs, 5), xs)

    def test_max_words(self):
        self.assertEqual(mg.max_words(self.docs), 5)  # grammar counts

    def test_categories(self):
        agg = mg.as_categories(self.docs, self.targets)
        self.assertEqual(len(agg), self.n_categories)

    def test_line_per_category(self):
        cats = mg.line_per_category(self.docs, self.targets)
        print(cats)
        self.assertEqual(len(cats), self.n_categories)

    def test_documents_to_vectors(self):
        n_cat = len(set(self.targets))
        max_w = mg.max_words(self.docs)
        vectors = mg.as_vectors_from_dtm(self.docs, self.targets, n_cat * max_w)
        self.assertEqual(len(vectors), len(self.docs))
        for vec in vectors:
            self.assertEqual(len(vec), n_cat * max_w)
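
Several of the tests above pin down the behaviour of
mg.pad_with_zeros_or_truncate without showing its implementation. A minimal
sketch consistent with those assertions (the real mg module may well differ)
could be:

def pad_with_zeros_or_truncate(xs, size):
    """Return exactly `size` elements: truncate xs, or right-pad it with zeros."""
    return list(xs[:size]) + [0] * max(0, size - len(xs))

Note that test_max_words expects 5 even though the longest fixture document,
'she went there? omg', has only four space-separated words; the "# grammar
counts" comment suggests the tokenizer in mg counts the '?' as a token of its
own, so a plain str.split would not satisfy that test.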
Example 7
    def test_categories(self):
        agg = mg.as_categories(self.docs, self.targets)
        self.assertEqual(len(agg), self.n_categories)
Example 8
    def test_line_per_category(self):
        cats = mg.line_per_category(self.docs, self.targets)
        print(cats)
        self.assertEqual(len(cats), self.n_categories)
Example 9
    def test_leave_list_the_right_size_alone(self):
        xs = [1] * 5
        self.assertEqual(mg.pad_with_zeros_or_truncate(xs, 5), xs)
Example 10
    def test_max_words(self):
        self.assertEqual(mg.max_words(self.docs), 5)  # grammar counts
Example 11
    def test_word_to_cat_vector(self):
        word2vec, n_features = mg.word_to_cat_vector(self.docs, self.targets)
        for word in mg.cleaned_docs(self.words):
            v = word2vec[word]
            self.assertEqual(len(v), self.n_categories)
Example 12
    def test_truncate_first(self):
        xs = [1, 2, 3, 4, 5]
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(truncated, [1, 2, 3])
Example 13
import tensorflow as tf

# Uses the TensorFlow 1.x session API; `util`, `log_every` and
# `test_train_indices` are defined elsewhere in the module this excerpt
# comes from.


def train_and_test_in_batches(x, out, y, sparse_tfidf_texts, targets, epoch):
    (optimizer, loss) = util.optimiser_loss(out, y)

    accuracy = util.accuracy_fn(out, y)

    all_test = range(sparse_tfidf_texts.shape[0])

    test_accs = []
    train_accs = []
    total_accs = []

    p_dropout = 0.9  # 1.0->88.8% accuracy. 0.8->68%, 0.95->81%, 0.85->71.9%, 0.9->76.5%, 0.97->83.3%, 0.99->84.7%, 1.0->85.3%

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(epoch):
            i_batch = 0
            total_batch_train_acc = 0.0
            total_batch_test_acc = 0.0
            total_batch_train_loss = 0.0
            total_batch_test_loss = 0.0
            testing_training = test_train_indices(len(targets), 128, 1.0)
            for (test_indices, train_indices) in testing_training:
                rand_index = train_indices
                rand_x = sparse_tfidf_texts[rand_index]  #.todense()
                rand_y = util.one_hot(rand_index, out.shape[1], targets)
                f_dict = {x: rand_x, y: rand_y}
                train_loss, _, train_acc = sess.run(
                    [loss, optimizer, accuracy], feed_dict=f_dict)
                if i % log_every == log_every - 1:
                    f_dict_test = {
                        x: sparse_tfidf_texts[test_indices],  #.todense(),
                        y: util.one_hot(test_indices, out.shape[1], targets)
                    }
                    test_acc = sess.run(accuracy, feed_dict=f_dict_test)
                    test_loss = sess.run(loss, feed_dict=f_dict_test)
                    i_batch = i_batch + 1
                    total_batch_test_acc += test_acc
                    total_batch_train_acc += train_acc
                    total_batch_train_loss += train_loss
                    total_batch_test_loss += test_loss
            if i % log_every == log_every - 1:
                print("\nEpoch %d " % i)
                test_acc = (total_batch_test_acc / i_batch)
                train_acc = (total_batch_train_acc / i_batch)
                print("Average test accuracy ", test_acc)
                print("Average train accuracy", train_acc)
                print("Average test loss     ",
                      (total_batch_test_loss / i_batch))
                print("Average train loss    ",
                      (total_batch_train_loss / i_batch))
                acc = sess.run(
                    accuracy,
                    feed_dict={
                        x: sparse_tfidf_texts[all_test],  #.todense(),
                        y: util.one_hot(all_test, out.shape[1], targets)
                    })
                print("batch accuracy        ", acc)
                test_accs.append(test_acc)
                train_accs.append(train_acc)
                total_accs.append(acc)
    return test_accs, train_accs, total_accs
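
The labels are fed to the network through util.one_hot(indices, n_classes,
targets), whose implementation is not part of this excerpt. Judging from how
it is called here (one row per selected document, with a 1 in the column of
that document's target class), a sketch of it might look like:

import numpy as np


def one_hot(indices, n_classes, targets):
    """One row per index, with column targets[i] set to 1 for each selected i."""
    encoded = np.zeros((len(indices), int(n_classes)))
    for row, i in enumerate(indices):
        encoded[row, targets[i]] = 1.0
    return encoded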
Example 14
    def test_pad_empty(self):
        xs = []
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(len(truncated), 3)
Example 15
    def test_pad_list(self):
        self.assertEqual(len(mg.pad_with_zeros_or_truncate([1] * 3, 5)), 5)
Example 16
    def test_cleaned(self):
        self.assertEqual(mg.cleaned("she went there? omg"), "she went there omg")
Example 17
    def test_docs_to_vecs(self):
        max_vec_size = mg.max_words(self.docs) * len(self.docs)
        vecs, n_features = mg.docs_to_vecs(self.docs, self.targets, max_vec_size)
        self.assertEqual(len(vecs), len(self.docs))
        for v in vecs:
            self.assertEqual(len(v), max_vec_size)
Example 18
    def test_truncate_list(self):
        xs = [1] * 10
        truncated = mg.pad_with_zeros_or_truncate(xs, 3)
        self.assertEqual(len(truncated), 3)
Example 19
    def test_stopword_removal(self):
        lines = mg.remove_stopwords(self.docs)
        self.assertEqual(len(self.docs), len(lines))
        print("lines", lines)
        self.assertTrue("hello" in lines)