Ejemplo n.º 1
0
def count_words(
        sa,
        to_lower=True,
        delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    count_words(sa, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Build the "bag-of-words" representation of every entry of an SArray.

    Each entry of a string/dict/list typed SArray is turned into a dictionary
    of (word, count) pairs. Dictionary keys and list elements must be strings.
    Every string is first split into words as dictated by the `to_lower` and
    `delimiters` options, and the occurrences of each word are then summed up.
    The keys of an output dictionary are the words found in the matching input
    entry and the values are how many times each of those words appears. By
    default, words are split on all whitespace and newline characters.

    This function is implemented using the `WordCounter` feature engineering
    transformer.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input data to be tokenized and counted.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Characters on which the input strings are split. Each entry in this
        list must contain a single character. If set to `None`, a Penn
        treebank-style tokenization is used instead, which contains smart
        handling of punctuation.

    Returns
    -------
    out : SArray[dict]
        One dictionary per input entry, holding the frequency count of each
        word in that entry.

    See Also
    --------
    count_ngrams, tf_idf, tokenize

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Create input data
        >>> sa = turicreate.SArray(["The quick brown fox jumps.",
        ...                       "Word word WORD, word!!!word"])

        # Run count_words
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]

        # Run count_words with Penn treebank style tokenization to handle
        # punctuation
        >>> turicreate.text_analytics.count_words(sa, delimiters=None)
        dtype: dict
        Rows: 2
        [{'brown': 1, 'jumps': 1, 'fox': 1, '.': 1, 'quick': 1, 'the': 1},
         {'word': 3, 'word!!!word': 1, ',': 1}]

        # Run count_words with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                       {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bob': 1.5, 'alice': 1.5}, {'a': 5, 'dog': 5, 'cat': 5}]

        # Run count_words with list input
        >>> sa = turicreate.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bar': 1, 'bah': 1, 'one': 1}, {'a': 2, 'dog': 2, 'cat': 1}]

    """
    # NOTE(review): the default `delimiters` list is a single object shared by
    # every call; this function only forwards it and never mutates it.

    # Reject anything that is not an SArray before doing any work.
    _raise_error_if_not_sarray(sa, "sa")

    # The transformer operates on SFrame columns, so the input is wrapped in a
    # one-column frame and the same column is read back from the result.
    column = 'docs'
    frame = _turicreate.SFrame({column: sa})

    counter_options = dict(
        features=column,
        to_lower=to_lower,
        delimiters=delimiters,
        output_column_prefix=None,
    )
    transformer = _feature_engineering.WordCounter(**counter_options)

    return transformer.fit_transform(frame)[column]
Ejemplo n.º 2
0
def count_words(text, to_lower=True, delimiters=DEFAULT_DELIMITERS):
    """
    Convert each row of `text` into a dictionary of (word, count) pairs.

    If `text` is an SArray of strings or an SArray of lists of strings, the
    occurrences of each word are counted for each row in the SArray.

    If `text` is an SArray of dictionaries, the keys are tokenized and the
    values are the counts. Counts for the same word, in the same row, are
    added together.

    This output is commonly known as the "bag-of-words" representation of text
    data.

    Parameters
    ----------
    text : SArray[str | dict | list]
        SArray of type: string, dict or list.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using `delimiters` characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which contains
        smart handling of punctuation.

    Returns
    -------
    out : SArray[dict]
        An SArray with the same length as the `text` input. For each row, the
        keys of the dictionary are the words and the values are the
        corresponding counts.

    See Also
    --------
    count_ngrams, tf_idf, tokenize

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_
    - `Penn treebank tokenization <https://web.archive.org/web/19970614072242/http://www.cis.upenn.edu:80/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import turicreate

        # Create input data
        >>> sa = turicreate.SArray(["The quick brown fox jumps.",
        ...                         "Word word WORD, word!!!word"])

        # Run count_words
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]

        # Run count_words with Penn treebank style tokenization to handle
        # punctuation
        >>> turicreate.text_analytics.count_words(sa, delimiters=None)
        dtype: dict
        Rows: 2
        [{'brown': 1, 'jumps': 1, 'fox': 1, '.': 1, 'quick': 1, 'the': 1},
         {'word': 3, 'word!!!word': 1, ',': 1}]

        # Run count_words with dictionary input
        >>> sa = turicreate.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                         {'a dog': 0, 'a dog cat': 5}])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bob': 1.5, 'alice': 1.5}, {'a': 5, 'dog': 5, 'cat': 5}]

        # Run count_words with list input
        >>> sa = turicreate.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> turicreate.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bar': 1, 'bah': 1, 'one': 1}, {'a': 2, 'dog': 2, 'cat': 1}]

    """

    # Fail fast with a clear error when the input is not an SArray.
    _raise_error_if_not_sarray(text, "text")

    # The WordCounter transformer works on SFrame columns, so wrap the input in
    # a single "docs" column and read that same column back from the result.
    sf = _turicreate.SFrame({"docs": text})
    fe = _feature_engineering.WordCounter(
        features="docs",
        to_lower=to_lower,
        delimiters=delimiters,
        output_column_prefix=None,
    )
    output_sf = fe.fit_transform(sf)

    return output_sf["docs"]