def test_vectorize_docs():
    # Checks that vectorize_docs() returns a (matrix, vocabulary) pair whose
    # index rows can be mapped back to the original documents.
    docs = load_line_corpus(CURDIR + "/data/corpus.txt")
    mat, vocab = vectorize_docs(docs)

    # Diagnostic output to help when one of the assertions below fails.
    # Written in call form so the module parses on Python 3 as well; with a
    # single argument it prints the same text under Python 2.
    print(vocab)

    assert_equal(vocab[0], u"product")
    assert_equal(vocab[1], u"right")
    assert_equal(len(vocab), 14)

    # One matrix row per document. Checked before zipping, because zip()
    # would silently truncate to the shorter sequence.
    assert_equal(len(mat), 2)

    # Each row holds vocabulary indices; looking them up must reproduce the
    # document's token sequence exactly.
    for doc, doc_m in zip(docs, mat):
        assert_equal([vocab[w] for w in doc_m], doc)
def test_load_line_corpus():
    # Checks that the corpus fixture loads as two documents and that the
    # first one has the expected token list.
    expected_first_doc = [
        u"product",
        u"defin",
        u"number",
        u"column",
        u"figur",
        u"right",
        u"illustr",
        u"diagrammat",
        u"product",
        u"two",
        u"matric",
    ]

    corpus_path = CURDIR + "/data/corpus.txt"
    docs = load_line_corpus(corpus_path)

    assert_equal(len(docs), 2)
    assert_equal(docs[0], expected_first_doc)
def test_doc2term_matrix():
    # Checks that the document-term matrix built from the corpus fixture
    # has shape (2, 14).
    corpus_path = CURDIR + "/data/corpus.txt"
    mat = doc2term_matrix(load_line_corpus(corpus_path))

    expected_shape = (2, 14)
    assert_equal(mat.shape, expected_shape)