Esempio n. 1
0
def test_get_doc_lengths_type(vectorizer_and_dtm):
    """Compare the 'sqrt' and 'log' doc lengths against the 'linear' ones.

    All three results must have one entry per row of the doc-term matrix;
    the 'sqrt' result must equal ``np.sqrt`` of the linear lengths and the
    'log' result must equal ``np.log`` of the linear lengths plus 1.0.
    """
    _vectorizer, dtm = vectorizer_and_dtm
    # Dict preserves insertion order, so calls happen as linear, sqrt, log.
    lengths_by_type = {
        kind: vsm.get_doc_lengths(dtm, type_=kind)
        for kind in ('linear', 'sqrt', 'log')
    }
    n_docs = dtm.shape[0]
    for lengths in lengths_by_type.values():
        assert len(lengths) == n_docs

    linear = lengths_by_type['linear']
    sqrt_matches = lengths_by_type['sqrt'] == np.sqrt(linear)
    assert sqrt_matches.all()
    log_matches = lengths_by_type['log'] == np.log(linear) + 1.0
    assert log_matches.all()
Esempio n. 2
0
def test_get_doc_lengths_exception(vectorizer_and_dtm):
    """Expect ``ValueError`` when ``type_`` is the unsupported value 'foo'."""
    _vectorizer, dtm = vectorizer_and_dtm
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        # Return value is irrelevant; only the raised exception matters.
        vsm.get_doc_lengths(dtm, type_='foo')
Esempio n. 3
0
def test_get_doc_lengths(tokenized_docs, vectorizer_and_dtm):
    """Check 'linear' doc lengths against ``len()`` of each tokenized doc.

    The result must have one entry per row of the doc-term matrix, and each
    entry must equal the length of the tokenized doc at the same position.
    """
    _vectorizer, dtm = vectorizer_and_dtm
    lengths = vsm.get_doc_lengths(dtm, type_='linear')
    assert len(lengths) == dtm.shape[0]
    # Pairwise comparison; stops at the first mismatch.
    assert all(
        length == len(tokens)
        for length, tokens in zip(lengths, tokenized_docs)
    )