def grp_vectorizer_and_gtm(tokenized_docs, groups):
    """Build a linear-tf/smooth-idf group vectorizer and fit it.

    Returns the fitted ``GroupVectorizer`` together with the resulting
    group-term matrix for ``tokenized_docs`` grouped by ``groups``.
    """
    vectorizer = vsm.GroupVectorizer(
        tf_type="linear",
        idf_type="smooth",
        norm=None,
        min_df=1,
        max_df=1.0,
        max_n_terms=None,
    )
    matrix = vectorizer.fit_transform(tokenized_docs, groups)
    return vectorizer, matrix
def test_grp_vectorizer_fixed_vocab(tokenized_docs, groups):
    """A user-supplied vocabulary must fully determine the matrix axes."""
    terms = ['lamb', 'snow', 'school', 'rule', 'teacher']
    grps = ['a', 'b']
    vectorizer = vsm.GroupVectorizer(
        vocabulary_terms=terms, vocabulary_grps=grps)
    matrix = vectorizer.fit_transform(tokenized_docs, groups)
    # term axis: size and contents come straight from the fixed vocabulary
    assert len(vectorizer.vocabulary_terms) == len(terms)
    assert matrix.shape[1] == len(terms)
    assert sorted(vectorizer.terms_list) == sorted(terms)
    # group axis: likewise pinned to the supplied group vocabulary
    assert len(vectorizer.vocabulary_grps) == len(grps)
    assert matrix.shape[0] == len(grps)
    assert sorted(vectorizer.grps_list) == sorted(grps)
def grp_vectorizer_and_gtm_2(tokenized_docs, groups):
    """Build a bm25-weighted group vectorizer (with doc-length scaling) and fit it.

    Returns the fitted ``GroupVectorizer`` and the group-term matrix
    produced from ``tokenized_docs`` grouped by ``groups``.
    """
    vectorizer = vsm.GroupVectorizer(
        tf_type="bm25", idf_type="smooth", norm=None, apply_dl=True,
        min_df=1, max_df=1.0, max_n_terms=None)
    matrix = vectorizer.fit_transform(tokenized_docs, groups)
    return vectorizer, matrix
# NOTE(review): this function originally repeated the name
# ``test_grp_vectorizer_fixed_vocab`` already defined earlier in the file;
# the redefinition shadowed the first copy (flake8 F811), so pytest only
# ever collected one of the two. Renamed so both are collected and run.
def test_grp_vectorizer_fixed_vocab_2(tokenized_docs, groups):
    """A user-supplied vocabulary must fully determine the matrix axes."""
    vocabulary_terms = ["lamb", "snow", "school", "rule", "teacher"]
    vocabulary_grps = ["a", "b"]
    grp_vectorizer = vsm.GroupVectorizer(
        vocabulary_terms=vocabulary_terms, vocabulary_grps=vocabulary_grps)
    grp_term_matrix = grp_vectorizer.fit_transform(tokenized_docs, groups)
    # term axis is pinned to the fixed term vocabulary
    assert len(grp_vectorizer.vocabulary_terms) == len(vocabulary_terms)
    assert grp_term_matrix.shape[1] == len(vocabulary_terms)
    assert sorted(grp_vectorizer.terms_list) == sorted(vocabulary_terms)
    # group axis is pinned to the fixed group vocabulary
    assert len(grp_vectorizer.vocabulary_grps) == len(vocabulary_grps)
    assert grp_term_matrix.shape[0] == len(vocabulary_grps)
    assert sorted(grp_vectorizer.grps_list) == sorted(vocabulary_grps)
def test_grp_vectorizer_bad_transform(tokenized_docs, groups):
    """Calling ``transform`` before ``fit`` must raise ``ValueError``."""
    unfitted = vsm.GroupVectorizer()
    with pytest.raises(ValueError):
        unfitted.transform(tokenized_docs, groups)
def grp_vectorizer_and_gtm(tokenized_docs, groups):
    """Build a linear-tf/smooth-idf group vectorizer and fit it.

    Returns the fitted ``GroupVectorizer`` and the group-term matrix
    for ``tokenized_docs`` grouped by ``groups``.

    FIX(review): the original used the legacy textacy Vectorizer kwargs
    (``weighting='tf'``, ``normalize``, ``sublinear_tf``, ``smooth_idf``,
    ``min_ic``), which no longer exist on ``GroupVectorizer`` and raise
    ``TypeError`` at construction. Translated to the current API:
    weighting='tf' + sublinear_tf=False -> tf_type='linear';
    smooth_idf=True -> idf_type='smooth'; normalize=False -> norm=None;
    min_ic=0.0 was a no-op default and has no modern counterpart.
    NOTE(review): this redefines the earlier ``grp_vectorizer_and_gtm`` in
    this file; they are now behaviorally identical — consider deduplicating.
    """
    grp_vectorizer = vsm.GroupVectorizer(
        tf_type='linear',
        idf_type='smooth',
        norm=None,
        min_df=1,
        max_df=1.0,
        max_n_terms=None,
    )
    grp_term_matrix = grp_vectorizer.fit_transform(tokenized_docs, groups)
    return grp_vectorizer, grp_term_matrix