Example #1
from textacy import vsm  # textacy's vector space model module


def test_vectorizer_weighting_combinations(tokenized_docs):
    init_params = [
        dict(tf_type='linear'),
        dict(tf_type='sqrt'),
        dict(tf_type='sqrt', apply_dl=True),
        dict(tf_type='sqrt', apply_dl=True, dl_type='sqrt'),
        dict(tf_type='linear', apply_idf=True),
        dict(tf_type='linear', apply_idf=True, idf_type='bm25'),
        dict(tf_type='linear', apply_idf=True, idf_type='standard', norm='l1'),
        dict(tf_type='linear', apply_idf=True, idf_type='standard', apply_dl=True),
        dict(tf_type='linear', apply_idf=True, idf_type='smooth', apply_dl=True, dl_type='log'),
        dict(tf_type='bm25', apply_idf=True, idf_type='bm25'),
        dict(tf_type='bm25', apply_idf=True, apply_dl=False),
        dict(tf_type='bm25', apply_idf=True, idf_type='smooth', norm='l2'),
    ]
    for ip in init_params:
        vectorizer = vsm.Vectorizer(**ip)
        vectorizer.fit(tokenized_docs)  # fit() returns the vectorizer itself, not a matrix
        # accessing the property should not raise for any valid combination
        vectorizer.weighting
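A minimal usage sketch for one of these combinations, assuming textacy's vsm API (fit_transform and the weighting property; the toy corpus below is illustrative, not from the test suite):

from textacy import vsm

tokenized_docs = [
    ['cat', 'sat', 'mat'],
    ['dog', 'sat', 'log', 'dog'],
]

vectorizer = vsm.Vectorizer(tf_type='bm25', apply_idf=True, idf_type='bm25')
doc_term_matrix = vectorizer.fit_transform(tokenized_docs)  # sparse matrix, docs x terms
print(doc_term_matrix.shape)  # (2, number of unique terms kept)
print(vectorizer.weighting)   # human-readable description of the weighting scheme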
Example #2
# method of a unittest.TestCase subclass; assumes `from textacy import vsm` at module level
def test_vectorizer_bad_init_params(self):
    bad_init_params = (
        {'min_df': -1},
        {'max_df': -1},
        {'max_n_terms': -1},
        {'min_ic': -1.0},
        {'vocabulary': 'foo bar bat baz'},
    )
    for bad_init_param in bad_init_params:
        with self.assertRaises(ValueError):
            vsm.Vectorizer(**bad_init_param)
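The same validation check in pytest style, matching the other examples on this page; the parameter names are taken straight from the snippet above (they belong to an older Vectorizer API, note min_ic):

import pytest
from textacy import vsm

@pytest.mark.parametrize(
    'bad_init_param',
    [
        {'min_df': -1},
        {'max_df': -1},
        {'max_n_terms': -1},
        {'min_ic': -1.0},
        {'vocabulary': 'foo bar bat baz'},
    ],
)
def test_vectorizer_bad_init_params(bad_init_param):
    # every invalid value should be rejected at construction time
    with pytest.raises(ValueError):
        vsm.Vectorizer(**bad_init_param)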
Example #3
import pytest
from textacy import vsm

def test_vectorizer_bad_transform(tokenized_docs):
    vectorizer = vsm.Vectorizer()
    # transform() before fit() fails: no vocabulary has been learned yet
    with pytest.raises(ValueError):
        _ = vectorizer.transform(tokenized_docs)
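The ValueError comes from calling transform() on an unfitted vectorizer; a sketch of the intended call order, with tokenized_docs as in the fixture above (assuming the usual fit/transform contract):

from textacy import vsm

vectorizer = vsm.Vectorizer()
vectorizer.fit(tokenized_docs)                          # builds the vocabulary first
doc_term_matrix = vectorizer.transform(tokenized_docs)  # now safe
# or, equivalently, in one step:
doc_term_matrix = vectorizer.fit_transform(tokenized_docs)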
Example #4
import math
from decimal import Decimal
from operator import itemgetter

import numpy as np

from textacy import vsm  # textacy's vector space model module (older API)


def most_discriminating_terms(terms_lists,
                              bool_array_grp1,
                              max_n_terms=1000,
                              top_n_terms=25):
    """
    Given a collection of documents assigned to 1 of 2 exclusive groups, get the
    `top_n_terms` most discriminating terms for group1-and-not-group2 and
    group2-and-not-group1.

    Args:
        terms_lists (Iterable[Iterable[str]]): a sequence of documents, each as a
            sequence of (str) terms; used as input to :func:`doc_term_matrix()`
        bool_array_grp1 (Iterable[bool]): an ordered sequence of True/False values,
            where True corresponds to documents falling into "group 1" and False
            corresponds to those in "group 2"
        max_n_terms (int): only consider terms whose document frequency is within
            the top `max_n_terms` out of all distinct terms; must be > 0
        top_n_terms (int or float): if int (must be > 0), the total number of most
            discriminating terms to return for each group; if float (must be in
            the interval (0, 1)), the fraction of `max_n_terms` to return for each group

    Returns:
        List[str]: top `top_n_terms` most discriminating terms for grp1-not-grp2
        List[str]: top `top_n_terms` most discriminating terms for grp2-not-grp1

    References:
        King, Gary, Patrick Lam, and Margaret Roberts. "Computer-Assisted Keyword
            and Document Set Discovery from Unstructured Text." (2014).
            http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.458.1445&rep=rep1&type=pdf
    """
    alpha_grp1 = 1
    alpha_grp2 = 1
    if isinstance(top_n_terms, float):
        # interpret a float in (0, 1) as a fraction of ``max_n_terms``;
        # cast to int so it can be used as a slice bound below
        top_n_terms = int(top_n_terms * max_n_terms)
    bool_array_grp1 = np.array(bool_array_grp1)
    bool_array_grp2 = np.invert(bool_array_grp1)

    vectorizer = vsm.Vectorizer(weighting='tf',
                                normalize=False,
                                sublinear_tf=False,
                                smooth_idf=True,
                                min_df=3,
                                max_df=0.95,
                                min_ic=0.0,
                                max_n_terms=max_n_terms)
    dtm = vectorizer.fit_transform(terms_lists)
    id2term = vectorizer.id_to_term

    # get doc freqs for all terms in grp1 documents
    dtm_grp1 = dtm[bool_array_grp1, :]
    n_docs_grp1 = dtm_grp1.shape[0]
    doc_freqs_grp1 = vsm.get_doc_freqs(dtm_grp1, normalized=False)

    # get doc freqs for all terms in grp2 documents
    dtm_grp2 = dtm[bool_array_grp2, :]
    n_docs_grp2 = dtm_grp2.shape[0]
    doc_freqs_grp2 = vsm.get_doc_freqs(dtm_grp2, normalized=False)

    # get terms that occur in a larger fraction of grp1 docs than grp2 docs
    term_ids_grp1 = np.where(
        doc_freqs_grp1 / n_docs_grp1 > doc_freqs_grp2 / n_docs_grp2)[0]

    # get terms that occur in a larger fraction of grp2 docs than grp1 docs
    term_ids_grp2 = np.where(
        doc_freqs_grp1 / n_docs_grp1 < doc_freqs_grp2 / n_docs_grp2)[0]

    # get grp1 terms doc freqs in and not-in grp1 and grp2 docs, plus marginal totals
    grp1_terms_grp1_df = doc_freqs_grp1[term_ids_grp1]
    grp1_terms_grp2_df = doc_freqs_grp2[term_ids_grp1]
    # grp1_terms_grp1_not_df = n_docs_grp1 - grp1_terms_grp1_df
    # grp1_terms_grp2_not_df = n_docs_grp2 - grp1_terms_grp2_df
    # grp1_terms_total_df = grp1_terms_grp1_df + grp1_terms_grp2_df
    # grp1_terms_total_not_df = grp1_terms_grp1_not_df + grp1_terms_grp2_not_df

    # get grp2 terms doc freqs in and not-in grp2 and grp1 docs, plus marginal totals
    grp2_terms_grp2_df = doc_freqs_grp2[term_ids_grp2]
    grp2_terms_grp1_df = doc_freqs_grp1[term_ids_grp2]
    # grp2_terms_grp2_not_df = n_docs_grp2 - grp2_terms_grp2_df
    # grp2_terms_grp1_not_df = n_docs_grp1 - grp2_terms_grp1_df
    # grp2_terms_total_df = grp2_terms_grp2_df + grp2_terms_grp1_df
    # grp2_terms_total_not_df = grp2_terms_grp2_not_df + grp2_terms_grp1_not_df

    # get grp1 terms' likelihoods, then sort for most discriminating grp1-not-grp2 terms
    grp1_terms_likelihoods = {}
    for idx, term_id in enumerate(term_ids_grp1):
        term1 = (
            Decimal(math.factorial(grp1_terms_grp1_df[idx] + alpha_grp1 - 1))
            * Decimal(math.factorial(grp1_terms_grp2_df[idx] + alpha_grp2 - 1))
            / Decimal(math.factorial(
                grp1_terms_grp1_df[idx] + grp1_terms_grp2_df[idx]
                + alpha_grp1 + alpha_grp2 - 1))
        )
        term2 = (
            Decimal(math.factorial(n_docs_grp1 - grp1_terms_grp1_df[idx] + alpha_grp1 - 1))
            * Decimal(math.factorial(n_docs_grp2 - grp1_terms_grp2_df[idx] + alpha_grp2 - 1))
            / Decimal(math.factorial(
                n_docs_grp1 + n_docs_grp2
                - grp1_terms_grp1_df[idx] - grp1_terms_grp2_df[idx]
                + alpha_grp1 + alpha_grp2 - 1))
        )
        grp1_terms_likelihoods[id2term[term_id]] = term1 * term2
    top_grp1_terms = [
        term for term, likelihood in sorted(
            grp1_terms_likelihoods.items(), key=itemgetter(1), reverse=True)[:top_n_terms]
    ]

    # get grp2 terms' likelihoods, then sort for most discriminating grp2-not-grp1 terms
    grp2_terms_likelihoods = {}
    for idx, term_id in enumerate(term_ids_grp2):
        term1 = (
            Decimal(math.factorial(grp2_terms_grp2_df[idx] + alpha_grp2 - 1))
            * Decimal(math.factorial(grp2_terms_grp1_df[idx] + alpha_grp1 - 1))
            / Decimal(math.factorial(
                grp2_terms_grp2_df[idx] + grp2_terms_grp1_df[idx]
                + alpha_grp2 + alpha_grp1 - 1))
        )
        term2 = (
            Decimal(math.factorial(n_docs_grp2 - grp2_terms_grp2_df[idx] + alpha_grp2 - 1))
            * Decimal(math.factorial(n_docs_grp1 - grp2_terms_grp1_df[idx] + alpha_grp1 - 1))
            / Decimal(math.factorial(
                n_docs_grp2 + n_docs_grp1
                - grp2_terms_grp2_df[idx] - grp2_terms_grp1_df[idx]
                + alpha_grp2 + alpha_grp1 - 1))
        )
        grp2_terms_likelihoods[id2term[term_id]] = term1 * term2
    top_grp2_terms = [
        term for term, likelihood in sorted(
            grp2_terms_likelihoods.items(), key=itemgetter(1), reverse=True)[:top_n_terms]
    ]

    return (top_grp1_terms, top_grp2_terms)
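The Decimal(math.factorial(...)) products above compute the two beta-binomial likelihood factors from King, Lam, and Roberts. Because only the ranking of terms matters, the same ordering can be obtained in log space with math.lgamma, avoiding huge integers; a minimal sketch (the helper name is illustrative, not part of any library), using the identity lgamma(k + 1) == log(k!):

import math

def log_term_likelihood(df_grp1, df_grp2, n_docs_grp1, n_docs_grp2,
                        alpha_grp1=1, alpha_grp2=1):
    # log of term1: log((x+a1-1)!) + log((y+a2-1)!) - log((x+y+a1+a2-1)!)
    log_term1 = (
        math.lgamma(df_grp1 + alpha_grp1)
        + math.lgamma(df_grp2 + alpha_grp2)
        - math.lgamma(df_grp1 + df_grp2 + alpha_grp1 + alpha_grp2)
    )
    # log of term2: same pattern on the "documents not containing the term" counts
    log_term2 = (
        math.lgamma(n_docs_grp1 - df_grp1 + alpha_grp1)
        + math.lgamma(n_docs_grp2 - df_grp2 + alpha_grp2)
        - math.lgamma(n_docs_grp1 + n_docs_grp2 - df_grp1 - df_grp2
                      + alpha_grp1 + alpha_grp2)
    )
    return log_term1 + log_term2

Sorting terms by this value in descending order yields the same ranking as sorting by term1 * term2.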
Example #5
import pytest
from textacy import vsm

@pytest.fixture  # decorator assumed: the function is consumed as a pytest fixture elsewhere
def vectorizer_and_dtm(tokenized_docs):
    vectorizer = vsm.Vectorizer(
        tf_type='linear', idf_type='smooth', norm=None,
        min_df=1, max_df=1.0, max_n_terms=None)
    doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
    return vectorizer, doc_term_matrix
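A test consuming this fixture might look like the following sketch; the terms_list property is an assumption about textacy's Vectorizer:

def test_doc_term_matrix_shape(vectorizer_and_dtm, tokenized_docs):
    vectorizer, doc_term_matrix = vectorizer_and_dtm
    # one row per document, one column per term kept by the vectorizer
    assert doc_term_matrix.shape[0] == len(tokenized_docs)
    assert doc_term_matrix.shape[1] == len(vectorizer.terms_list)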
Example #6
import pytest
from textacy import vsm  # older Vectorizer API: weighting/normalize/sublinear_tf/smooth_idf/min_ic

@pytest.fixture  # decorator assumed, as above
def vectorizer_and_dtm(tokenized_docs):
    vectorizer = vsm.Vectorizer(
        weighting='tf', normalize=False, sublinear_tf=False, smooth_idf=True,
        min_df=1, max_df=1.0, min_ic=0.0, max_n_terms=None)
    doc_term_matrix = vectorizer.fit_transform(tokenized_docs)
    return vectorizer, doc_term_matrix
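Examples #5 and #6 define the same fixture against two generations of the Vectorizer API: the newer tf_type/idf_type/norm keywords versus the older weighting/normalize/sublinear_tf/smooth_idf/min_ic keywords. Roughly, weighting='tf' with normalize=False corresponds to tf_type='linear' with norm=None, and smooth_idf=True to idf_type='smooth'; treat the exact mapping as version-dependent.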