Example #1
# Imports needed by this snippet; `tokenize` and
# `_not_tokenized_warning_message` come from the surrounding texthero module.
import warnings

import pandas as pd
from gensim.sklearn_api.phrases import PhrasesTransformer
from texthero._types import TokenSeries


def phrases(s: TokenSeries,
            min_count: int = 5,
            threshold: int = 10,
            symbol: str = "_") -> TokenSeries:
    r"""Group up collocations words

    Given a pandas Series of tokenized strings, group together bigrams where
    each token has at least `min_count` term frequency and where the score
    given by the formula below is larger than `threshold`:

    :math:`\frac{(bigram\_a\_b\_count - min\_count) * len\_vocab}
    {word\_a\_count * word\_b\_count}`

    Parameters
    ----------
    s : :class:`texthero._types.TokenSeries`

    min_count : int, optional, default=5
        Ignore tokens with frequency less than this.

    threshold : int, optional, default=10
        Ignore bigrams with a score below this threshold.

    symbol : str, optional, default="_"
        Character used to join collocation words.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series([['New', 'York', 'is', 'a', 'beautiful', 'city'],
    ...                ['Look', ':', 'New', 'York', '!']])
    >>> hero.phrases(s, min_count=1, threshold=1)
    0    [New_York, is, a, beautiful, city]
    1                [Look, :, New_York, !]
    dtype: object

    References
    ----------
    `Mikolov, et al.: "Distributed Representations of Words and Phrases and
    their Compositionality" <https://arxiv.org/abs/1310.4546>`_

    """

    # tokenize on the fly if the input Series is not yet tokenized
    if not isinstance(s.iloc[0], list):
        warnings.warn(_not_tokenized_warning_message, DeprecationWarning)
        s = tokenize(s)

    # encode the delimiter to bytes, as the underlying gensim phrase model
    # expects here
    delimiter = symbol.encode("utf-8")
    phrases = PhrasesTransformer(min_count=min_count,
                                 threshold=threshold,
                                 delimiter=delimiter)
    return pd.Series(phrases.fit_transform(s.values), index=s.index)
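
# A quick worked example of the score from the docstring above, using
# hypothetical counts (not from the source):
def bigram_score(bigram_count, word_a_count, word_b_count, min_count, len_vocab):
    # (bigram_a_b_count - min_count) * len_vocab / (word_a_count * word_b_count)
    return (bigram_count - min_count) * len_vocab / (word_a_count * word_b_count)

# e.g. a bigram seen 8 times, whose words are seen 10 and 12 times, with
# min_count=5 and a vocabulary of 2000 entries, scores
# (8 - 5) * 2000 / (10 * 12) = 50.0, which clears the default threshold of
# 10, so the pair would be joined.
print(bigram_score(8, 10, 12, 5, 2000))  # 50.0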
Example #2
# Imports needed by this snippet; `tokenize` is texthero's tokenizer from the
# surrounding module.
import pandas as pd
from gensim.sklearn_api.phrases import PhrasesTransformer


def tokenize_with_phrases(s: pd.Series,
                          min_count: int = 5,
                          threshold: int = 10) -> pd.Series:
    r"""Tokenize and group up collocations words

    Tokenize the given pandas Series and group up bigrams where each
    token has at least min_count term frequrncy and where the threshold
    is larger than the underline formula.

    :math:`\frac{(bigram\_a\_b\_count - min\_count)* len\_vocab }{ (word\_a\_count * word\_b\_count)}`.

    Parameters
    ----------
    s : Pandas Series

    min_count : int, optional, default=5
        Ignore tokens with frequency less than this.

    threshold : int, optional, default=10
        Ignore bigrams with a score below this threshold.

    Examples
    --------
    >>> import texthero as hero
    >>> import pandas as pd
    >>> s = pd.Series(["New York is a beautiful city", "Look: New York!"])
    >>> hero.tokenize_with_phrases(s, min_count=1, threshold=1)
    0    [New_York, is, a, beautiful, city]
    1                [Look, :, New_York, !]
    dtype: object

    References
    ----------
    `Mikolov, et al.: "Distributed Representations of Words and Phrases and
    their Compositionality" <https://arxiv.org/abs/1310.4546>`_

    """

    # this helper expects raw (untokenized) strings
    if not isinstance(s.iloc[0], str):
        raise ValueError("Input Series should contain raw strings.")

    s = tokenize(s)
    # fit the phrase model; detected bigrams are joined with the default "_"
    phrases = PhrasesTransformer(min_count=min_count, threshold=threshold)
    return pd.Series(phrases.fit_transform(s.values), index=s.index)
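
# The practical difference between the two helpers: `phrases` expects an
# already tokenized Series, while `tokenize_with_phrases` takes raw strings
# and tokenizes internally. A minimal sketch, assuming both are exposed on
# the texthero namespace as in the doctests above:
import pandas as pd
import texthero as hero

raw = pd.Series(["New York is a beautiful city", "Look: New York!"])
print(hero.tokenize_with_phrases(raw, min_count=1, threshold=1))
print(hero.phrases(hero.tokenize(raw), min_count=1, threshold=1))
# both print: [New_York, is, a, beautiful, city] / [Look, :, New_York, !]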
Example #3
# sort to view the most common terms
# the key (lambda x: x[1]) sorts by the count
sorted(window_count.items(), key=lambda x: x[1], reverse=True)[0:20]


# ### Phrase (collocation) Detection
#
# Phrase modeling is another approach to learning combinations of tokens that together represent meaningful multi-word concepts. We can develop phrase models by looping over the words in our reviews and looking for words that co-occur (i.e., appear one after another) much more frequently than we would expect by random chance. The formula our phrase models will use to determine whether two tokens $A$ and $B$ constitute a phrase is:
#
# $$\frac{(count(A, B) - count_{min}) \times N}{count(A) \times count(B)} > threshold$$
#
# where $N$ is the vocabulary size.

# ##### Scikit-learn API for Gensim

from gensim.sklearn_api.phrases import PhrasesTransformer

# fit the phrase model on the tokenized sentences
sklearn_phrases = PhrasesTransformer(min_count=3, threshold=3)
sklearn_phrases.fit(matched_sents)

# sklearn_phrases.transform(matched_sents)
print(matched_sents)


# review phrase matches: keep transformed terms containing at least two
# "_" delimiters
phrases = []
for terms in sklearn_phrases.transform(matched_sents):
    for term in terms:
        if term.count('_') >= 2:
            phrases.append(term)
print(set(phrases))
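
# For comparison, a minimal sketch of the same step with gensim's native
# (non-sklearn) Phrases API; `matched_sents` is assumed, as above, to be a
# list of token lists.
from gensim.models import Phrases

native_phrases = Phrases(matched_sents, min_count=3, threshold=3)
native_joined = [native_phrases[sent] for sent in matched_sents]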

# create a list of stop words
Example #4
def extract_bigrams(self, X, y=None):
    # fit a fresh phrase model on X and return the token lists with
    # detected collocations joined
    return PhrasesTransformer(**self.phrases_kwargs).fit_transform(X)
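
# A minimal sketch of the kind of wrapper class this method appears to belong
# to; the class name `BigramExtractor` and its constructor are assumptions,
# not from the source.
from gensim.sklearn_api.phrases import PhrasesTransformer


class BigramExtractor:
    def __init__(self, **phrases_kwargs):
        # keyword arguments forwarded to PhrasesTransformer,
        # e.g. min_count=3, threshold=3
        self.phrases_kwargs = phrases_kwargs

    def extract_bigrams(self, X, y=None):
        return PhrasesTransformer(**self.phrases_kwargs).fit_transform(X)


extractor = BigramExtractor(min_count=1, threshold=1)
sents = [["new", "york", "is", "big"], ["i", "love", "new", "york"]]
print(extractor.extract_bigrams(sents))
# with these toy settings, "new york" is joined into "new_york"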
Example #5
def transform(self, X, y=None):
    # keep only the tokens the phrase model joined (those containing "_"),
    # dropping everything else
    return [list(filter(lambda x: '_' in x, ls)) for ls in
            PhrasesTransformer(**self.phrases_kwargs).fit_transform(X)]
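
# Hypothetical usage, assuming the same wrapper shape as sketched after
# Example #4 with phrases_kwargs = dict(min_count=1, threshold=1): only the
# joined phrase tokens survive, e.g.
#
#   [['new', 'york', 'is', 'big'], ['i', 'love', 'new', 'york']]
#     -> [['new_york'], ['new_york']]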