# Example 1
def grp_vectorizer_and_gtm(tokenized_docs, groups):
    """Build a linear-tf / smooth-idf group vectorizer and fit it.

    Returns the fitted vectorizer together with the resulting
    group-term matrix for ``tokenized_docs`` grouped by ``groups``.
    """
    vectorizer = vsm.GroupVectorizer(
        tf_type='linear',
        idf_type='smooth',
        norm=None,
        min_df=1,
        max_df=1.0,
        max_n_terms=None,
    )
    matrix = vectorizer.fit_transform(tokenized_docs, groups)
    return vectorizer, matrix
# Example 2
def test_grp_vectorizer_fixed_vocab(tokenized_docs, groups):
    """A user-supplied vocabulary fixes both the term and group axes."""
    terms = ['lamb', 'snow', 'school', 'rule', 'teacher']
    grps = ['a', 'b']
    vectorizer = vsm.GroupVectorizer(vocabulary_terms=terms,
                                     vocabulary_grps=grps)
    matrix = vectorizer.fit_transform(tokenized_docs, groups)
    # Term axis (columns) must mirror the fixed term vocabulary.
    assert len(vectorizer.vocabulary_terms) == len(terms)
    assert matrix.shape[1] == len(terms)
    assert sorted(vectorizer.terms_list) == sorted(terms)
    # Group axis (rows) must mirror the fixed group vocabulary.
    assert len(vectorizer.vocabulary_grps) == len(grps)
    assert matrix.shape[0] == len(grps)
    assert sorted(vectorizer.grps_list) == sorted(grps)
# Example 3
def grp_vectorizer_and_gtm_2(tokenized_docs, groups):
    """Fit a BM25-weighted group vectorizer (with document-length scaling).

    Returns the fitted vectorizer and its group-term matrix.
    """
    settings = dict(
        tf_type="bm25",
        idf_type="smooth",
        norm=None,
        apply_dl=True,
        min_df=1,
        max_df=1.0,
        max_n_terms=None,
    )
    vectorizer = vsm.GroupVectorizer(**settings)
    matrix = vectorizer.fit_transform(tokenized_docs, groups)
    return vectorizer, matrix
# Example 4
def test_grp_vectorizer_fixed_vocab(tokenized_docs, groups):
    """Fixed vocabularies determine the group-term matrix's shape and labels."""
    vocabulary_terms = ["lamb", "snow", "school", "rule", "teacher"]
    vocabulary_grps = ["a", "b"]
    n_terms = len(vocabulary_terms)
    n_grps = len(vocabulary_grps)
    grp_vectorizer = vsm.GroupVectorizer(
        vocabulary_terms=vocabulary_terms,
        vocabulary_grps=vocabulary_grps,
    )
    gtm = grp_vectorizer.fit_transform(tokenized_docs, groups)
    assert gtm.shape == (n_grps, n_terms)
    assert len(grp_vectorizer.vocabulary_terms) == n_terms
    assert sorted(grp_vectorizer.terms_list) == sorted(vocabulary_terms)
    assert len(grp_vectorizer.vocabulary_grps) == n_grps
    assert sorted(grp_vectorizer.grps_list) == sorted(vocabulary_grps)
# Example 5
def test_grp_vectorizer_bad_transform(tokenized_docs, groups):
    """Transforming with an unfitted vectorizer must raise ValueError."""
    unfitted = vsm.GroupVectorizer()
    with pytest.raises(ValueError):
        unfitted.transform(tokenized_docs, groups)
# Example 6
def grp_vectorizer_and_gtm(tokenized_docs, groups):
    """Fit a plain-tf group vectorizer and return it with its matrix.

    NOTE(review): these keyword arguments (``weighting``, ``normalize``,
    ``sublinear_tf``, ``smooth_idf``, ``min_ic``) belong to an older
    textacy GroupVectorizer API than the ``tf_type``/``idf_type`` style
    used elsewhere in this file — confirm the installed textacy version
    before reusing this snippet.
    """
    vectorizer = vsm.GroupVectorizer(
        weighting='tf',
        normalize=False,
        sublinear_tf=False,
        smooth_idf=True,
        min_df=1,
        max_df=1.0,
        min_ic=0.0,
        max_n_terms=None,
    )
    matrix = vectorizer.fit_transform(tokenized_docs, groups)
    return vectorizer, matrix