Ejemplo n.º 1
0
def test_get_vocabulary():
    """Check that the vocabulary of the two sample documents has 10 entries.

    ``raw1`` and ``raw2`` are defined outside this function; each is split
    on single spaces to form one tokenised document.
    """
    tokenised_docs = [raw1.split(' '), raw2.split(' ')]

    # The collection is created with an empty vocabulary; the vocabulary is
    # then derived from the document list the collection exposes.
    collection = parse.DocumentCollection(tokenised_docs, [])
    vocabulary = parse.get_vocabulary(collection.doc_list)

    assert len(vocabulary) == 10
Ejemplo n.º 2
0
def test_compute_IDF():
    """Check the IDF entries at indices 0 and 1 for the two sample documents.

    ``raw1`` and ``raw2`` are defined outside this function; each is split
    on single spaces to form one tokenised document. The vocabulary is
    built from those documents and both are wrapped in a
    ``DocumentCollection`` before ``vector.compute_IDF`` is called.
    """
    # Local import keeps this test self-contained regardless of the
    # module's top-level imports.
    import math

    doc_list = [raw1.split(' '), raw2.split(' ')]
    vocab = parse.get_vocabulary(doc_list)
    doc_col = parse.DocumentCollection(doc_list, vocab)

    idf_table = vector.compute_IDF(doc_col.doc_list, doc_col.vocab)

    # The expected literal is numerically math.log(2). Computed logarithms
    # are compared with a tolerance so the test does not depend on the exact
    # last-bit rounding of the implementation.
    assert math.isclose(idf_table[1], 0.6931471805599453)
    # isclose() needs an absolute tolerance when the expected value is zero.
    assert math.isclose(idf_table[0], 0.0, abs_tol=1e-12)
Ejemplo n.º 3
0
def test_Vector():
    """Check one TF-IDF entry computed through ``vector.VectorCollection``.

    ``raw1`` and ``raw2`` are defined outside this function; each is split
    on single spaces to form one tokenised document. After
    ``compute_TF_IDF()`` runs, the entry at row 0, column 1 of
    ``tf_idf_table`` is checked.
    """
    # Local import keeps this test self-contained regardless of the
    # module's top-level imports.
    import math

    doc_list = [raw1.split(' '), raw2.split(' ')]
    vocab = parse.get_vocabulary(doc_list)
    doc_col = parse.DocumentCollection(doc_list, vocab)

    # Reuse the collection built above instead of constructing a second,
    # identical one and leaving this one unused.
    v = vector.VectorCollection(doc_col)
    v.compute_TF_IDF()

    # The expected literal is numerically 2 * math.log(2); compare with a
    # tolerance rather than exact float equality.
    assert math.isclose(v.tf_idf_table[0][1], 1.3862943611198906)
Ejemplo n.º 4
0
def test_compute_TF():
    """Check the TF entries for vocabulary index 1 in both sample documents.

    ``raw1`` and ``raw2`` are defined outside this function; each is split
    on single spaces to form one tokenised document.
    """
    tokenised_docs = [raw1.split(' '), raw2.split(' ')]
    vocabulary = parse.get_vocabulary(tokenised_docs)
    collection = parse.DocumentCollection(tokenised_docs, vocabulary)

    frequencies = vector.compute_TF(collection.doc_list, collection.vocab)

    # Column 1 is expected to hold 2.0 for the first document and 0 for the
    # second.
    term_index = 1
    first_doc_row, second_doc_row = frequencies[0], frequencies[1]
    assert first_doc_row[term_index] == 2.0
    assert second_doc_row[term_index] == 0
Ejemplo n.º 5
0
def test_compute_TF_IDF():
    """Check the TF-IDF table built from separately computed TF and IDF.

    ``raw1`` and ``raw2`` are defined outside this function; each is split
    on single spaces to form one tokenised document. The test verifies one
    entry of the resulting table and that its second dimension matches the
    vocabulary size.
    """
    # Local import keeps this test self-contained regardless of the
    # module's top-level imports.
    import math

    doc_list = [raw1.split(' '), raw2.split(' ')]
    vocab = parse.get_vocabulary(doc_list)
    doc_col = parse.DocumentCollection(doc_list, vocab)

    tf_table = vector.compute_TF(doc_col.doc_list, doc_col.vocab)
    idf_table = vector.compute_IDF(doc_col.doc_list, doc_col.vocab)
    tf_idf_table = vector.compute_TF_IDF(tf_table, idf_table)

    # The expected literal is numerically 2 * math.log(2); compare with a
    # tolerance rather than exact float equality.
    assert math.isclose(tf_idf_table[0][1], 1.3862943611198906)
    # One column per vocabulary term.
    assert tf_idf_table.shape[1] == len(vocab)
Ejemplo n.º 6
0
def test_Vector_file():
    """Check that a TF-IDF table survives a save/load round trip on disk.

    ``raw1`` and ``raw2`` are defined outside this function; each is split
    on single spaces to form one tokenised document. One
    ``VectorCollection`` computes and saves its table to a scratch folder;
    a second, created from an empty ``DocumentCollection``, loads it back.
    The two ``tf_idf_table`` attributes must then be element-wise equal.
    """
    doc_list = [raw1.split(' '), raw2.split(' ')]
    vocab = parse.get_vocabulary(doc_list)

    v1 = vector.VectorCollection(parse.DocumentCollection(doc_list, vocab))
    v1.compute_TF_IDF()
    v2 = vector.VectorCollection(parse.DocumentCollection([], []))

    folder = '__temp__'
    try:
        v1.save_to_dir(folder)
        v2.load_from_dir(folder)
    finally:
        # Always remove the scratch folder, even when saving or loading
        # raises, so a failed run does not leave files behind.
        # ignore_errors avoids masking the original exception if the folder
        # was never created.
        shutil.rmtree(folder, ignore_errors=True)

    assert (v1.tf_idf_table == v2.tf_idf_table).all()