def test_get_doc_lengths_type(vectorizer_and_dtm):
    """Check the 'sqrt' and 'log' length types against the 'linear' lengths.

    Every variant must yield one value per row of the doc-term matrix.
    The 'sqrt' values must match ``sqrt(linear)`` and the 'log' values must
    match ``log(linear) + 1``.
    """
    _, doc_term_matrix = vectorizer_and_dtm
    dls = vsm.get_doc_lengths(doc_term_matrix, type_='linear')
    dls_sqrt = vsm.get_doc_lengths(doc_term_matrix, type_='sqrt')
    dls_log = vsm.get_doc_lengths(doc_term_matrix, type_='log')
    assert len(dls) == len(dls_sqrt) == len(dls_log) == doc_term_matrix.shape[0]
    # Compare floats with a tolerance rather than exact ``==``: the test
    # should not depend on the implementation using bit-identical arithmetic,
    # and assert_allclose reports which elements differ on failure.
    np.testing.assert_allclose(dls_sqrt, np.sqrt(dls))
    np.testing.assert_allclose(dls_log, np.log(dls) + 1.0)
def test_get_doc_lengths_exception(vectorizer_and_dtm):
    """An unrecognized ``type_`` value must make get_doc_lengths raise ValueError."""
    _vectorizer, dtm = vectorizer_and_dtm
    with pytest.raises(ValueError):
        # The return value is irrelevant; only the raised error matters.
        vsm.get_doc_lengths(dtm, type_='foo')
def test_get_doc_lengths(tokenized_docs, vectorizer_and_dtm):
    """Check 'linear' doc lengths against the token count of each document.

    There must be one length per row of the doc-term matrix, and each length
    must equal ``len()`` of the tokenized document paired with it.
    """
    _vectorizer, dtm = vectorizer_and_dtm
    doc_lengths = vsm.get_doc_lengths(dtm, type_='linear')
    n_rows = dtm.shape[0]
    assert len(doc_lengths) == n_rows
    # Lengths and documents are paired positionally, in iteration order.
    assert all(
        length == len(tokens)
        for length, tokens in zip(doc_lengths, tokenized_docs)
    )