def tf_idf(dataset):
    """
    Compute the TF-IDF scores for each word in each document. The collection
    of documents must be in bag-of-words format.

    .. math::
        \mbox{TF-IDF}(w, d) = tf(w, d) \times \log(N / f(w))

    where :math:`tf(w, d)` is the number of times word :math:`w` appeared in
    document :math:`d`, :math:`f(w)` is the number of documents word :math:`w`
    appeared in, :math:`N` is the number of documents, and we use the
    natural logarithm.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`.

    Parameters
    ----------
    dataset : SArray[str | dict | list]
        Input text data. See :py:class:`~graphlab.toolkits.feature_engineering._tf_idf.TFIDF`
        documentation for details on how string, dict, and list inputs are handled.

    Returns
    -------
    out : SArray[dict]
        The same document corpus, where each word count has been replaced by
        the word's TF-IDF score.

    See Also
    --------
    count_words, count_ngrams, tokenize,
    graphlab.toolkits.feature_engineering._tf_idf.TFIDF

    References
    ----------
    - `Wikipedia - TF-IDF <https://en.wikipedia.org/wiki/TFIDF>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        >>> docs = graphlab.SArray('https://static.turi.com/datasets/nips-text')
        >>> docs_tfidf = graphlab.text_analytics.tf_idf(docs)
    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.tf_idf')

    _raise_error_if_not_sarray(dataset, "dataset")

    if len(dataset) == 0:
        return _graphlab.SArray()

    dataset = _graphlab.SFrame({'docs': dataset})
    scores = _graphlab.feature_engineering.TFIDF('docs').fit_transform(dataset)

    return scores['docs']
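
# A minimal pure-Python sketch of the TF-IDF formula documented above, for
# illustration only; the TFIDF transformer is the supported implementation.
# Assumes the corpus is already in bag-of-words format (one {word: count}
# dict per document).
def _tf_idf_sketch(docs):
    import math
    n_docs = len(docs)
    # f(w): the number of documents in which word w appears
    doc_freq = {}
    for doc in docs:
        for word in doc:
            doc_freq[word] = doc_freq.get(word, 0) + 1
    # tf(w, d) * log(N / f(w)), using the natural logarithm
    return [{w: tf * math.log(n_docs / float(doc_freq[w]))
             for w, tf in doc.items()} for doc in docs]

# >>> _tf_idf_sketch([{'a': 2, 'b': 1}, {'b': 3}])
# [{'a': 1.3862943611198906, 'b': 0.0}, {'b': 0.0}]
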
def _supervised_evaluation_error_checking(targets, predictions):
    """
    Perform basic error checking for the evaluation metrics. Check
    types and sizes of the inputs.
    """
    _raise_error_if_not_sarray(targets, "targets")
    _raise_error_if_not_sarray(predictions, "predictions")
    if (targets.size() != predictions.size()):
        raise _ToolkitError(
         "Input SArrays 'targets' and 'predictions' must be of the same length.")
def split_by_sentence(sa):
    """
    The SentenceSplitter takes SArrays of type string or list, and returns an
    SArray of type list of strings, where each element is a single sentence.
    If the input SArray is of type list, each element must be either a list or
    a string. The list is flattened and concatenated, and the result is then
    split into sentences.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._sentence_splitter.SentenceSplitter`.

    Parameters
    ----------
    sa : SArray[str | list]
        Input data to be split into sentences.

    Returns
    -------
    out : SArray[list]
        Each element of the list is a sentence.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._sentence_splitter.SentenceSplitter

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps.The slow brown fox" +
            " crawls"])

        # Run split_by_sentence
        >>> graphlab.text_analytics.split_by_sentence(sa)
        dtype: list
        Rows: 1
        [['The quick brown fox jumps.', 'The slow brown fox crawls']]

        # Input SArray of type list
        >>> sa = graphlab.SArray([["The quick brown fox jumps.", "The slow brown fox" +
            " crawls"]])

        # Run split_by_sentence
        >>> graphlab.text_analytics.split_by_sentence(sa)
        dtype: list
        Rows: 1
        [['The quick brown fox jumps.', 'The slow brown fox crawls']]

    """

    _mt._get_metric_tracker().track('toolkit.text_analytics.split_by_sentence')

    _raise_error_if_not_sarray(sa, "sa")

    ## Split each document into sentences
    sf = _graphlab.SFrame({'docs': sa})
    fe = _graphlab.feature_engineering.SentenceSplitter(features='docs',
                                                   output_column_prefix=None,
                                                   verbose=False)
    output_sf = fe.fit_transform(sf)

    return output_sf['docs']
def tokenize(sa, to_lower=False,
                delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    tokenize(sa, to_lower=False, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Tokenize the input SArray of text strings and return the list of tokens.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._tokenizer.Tokenizer`.
    Please refer to the Tokenizer documentation for details about how
    tokenization is done.

    Parameters
    ----------
    sa : SArray[str]
        Input data of strings representing English text. This tokenizer is not
        intended to process XML, HTML, or other structured text formats.

    to_lower : bool, optional
        If True, all strings are converted to lower case before tokenization.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which contains
        smart handling of punctuation.

    Returns
    -------
    out : SArray[list]
        Each text string in the input is mapped to a list of tokens.

    See Also
    --------
    count_words, count_ngrams, tf_idf,
    graphlab.toolkits.feature_engineering._tokenizer.Tokenizer

    References
    ----------
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        >>> docs = graphlab.SArray(['This is the first sentence.',
        ...                         'This one, it\'s the second sentence.'])

        # Default tokenization by space characters
        >>> graphlab.text_analytics.tokenize(docs)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence.'],
         ['This', 'one,', "it's", 'the', 'second', 'sentence.']]

        # Penn treebank-style tokenization
        >>> graphlab.text_analytics.tokenize(docs, delimiters=None)
        dtype: list
        Rows: 2
        [['This', 'is', 'the', 'first', 'sentence', '.'],
         ['This', 'one', ',', 'it', "'s", 'the', 'second', 'sentence', '.']]

    """
    _mt._get_metric_tracker().track('toolkit.text_analytics.tokenize')

    _raise_error_if_not_sarray(sa, "sa")

    ## Tokenize the documents
    sf = _graphlab.SFrame({'docs': sa})
    fe = _graphlab.feature_engineering.Tokenizer(features='docs',
                                                 to_lower=to_lower,
                                                 delimiters=delimiters,
                                                 output_column_prefix=None)
    tokens = fe.fit_transform(sf)

    return tokens['docs']
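
# A rough pure-Python approximation of the default delimiter behaviour
# documented above: split on any of the single-character delimiters and
# drop empty tokens. Illustrative only; the Tokenizer transformer is the
# supported implementation, and the Penn treebank mode (delimiters=None)
# is not reproduced here.
def _tokenize_sketch(text, delimiters=("\r", "\v", "\n", "\f", "\t", " ")):
    import re
    pattern = "[" + re.escape("".join(delimiters)) + "]+"
    return [token for token in re.split(pattern, text) if token]

# >>> _tokenize_sketch('This is the first sentence.')
# ['This', 'is', 'the', 'first', 'sentence.']
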
def trim_rare_words(sa, threshold=2, to_lower=True,
                delimiters=["\r", "\v", "\n", "\f", "\t", " "], stopwords=None):
    '''
    Remove words that occur below a certain number of times in an SArray.
    This is a common method of cleaning text before it is used, and can increase the
    quality and explainability of the models learned on the transformed data.

    RareWordTrimmer can be applied to string-, dictionary-, and list-typed
    SArrays.

    * **string** : The string is first tokenized. By default, all letters are
      first converted to lower case, then tokenized by space characters. Each
      token is taken to be a word, and the words occurring below a threshold
      number of times across the entire column are removed, then the remaining
      tokens are concatenated back into a string.

    * **list** : Each element of the list must be a string, where each element
      is assumed to be a token. Tokens occurring below the threshold number of
      times across the entire column are then removed.

    * **dict** : The method first obtains the list of keys in the dictionary.
      This list is then processed as a standard list, except the value of each
      key must be of integer type and is considered to be the count of that key.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._word_trimmer.RareWordTrimmer`.


    Parameters
    ----------
    sa : SArray[str | dict | list]
        The input text data.

    threshold : int, optional
        The count below which words are removed from the input.

    to_lower : bool, optional
        Indicates whether to map the input strings to lower case before
        counting.

    delimiters : list[str], optional
        A list of delimiter characters for tokenization. By default, the list
        is defined to be the list of space characters. The user can define
        any custom list of single-character delimiters. Alternatively, setting
        `delimiters=None` will use a Penn treebank-style tokenization, which
        is better at handling punctuation. (See reference below for details.)

    stopwords : list[str], optional
        A manually specified list of stopwords, which are removed regardless
        of count.

    Returns
    -------
    out : SArray
        An SArray with words occurring below the threshold removed.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._word_trimmer.RareWordTrimmer

    References
    ----------
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps in a fox like way.",
        ...                       "Word word WORD, word!!!word"])

        # Run trim_rare_words
        >>> graphlab.text_analytics.trim_rare_words(sa)
        dtype: str
        Rows: 2
        ['fox fox', 'word word']

        # Run trim_rare_words with Penn treebank style tokenization to handle
        # punctuation
        >>> graphlab.text_analytics.trim_rare_words(sa, delimiters=None)
        dtype: str
        Rows: 2
        ['fox fox', 'word word word']

        # Run trim_rare_words with dictionary input
        >>> sa = graphlab.SArray([{'alice bob': 1, 'Bob alice': 2},
        ...                       {'a dog': 0, 'a dog cat': 5}])
        >>> graphlab.text_analytics.trim_rare_words(sa)
        dtype: dict
        Rows: 2
        [{'bob alice': 2}, {'a dog cat': 5}]

        # Run trim_rare_words with list input
        >>> sa = graphlab.SArray([['one', 'bar bah', 'One'],
        ...                     ['a dog', 'a dog cat', 'A DOG']])
        >>> graphlab.text_analytics.trim_rare_words(sa)
        dtype: list
        Rows: 2
        [['one', 'one'], ['a dog', 'a dog']]


    '''

    _mt._get_metric_tracker().track('toolkit.text_analytics.trim_rare_words')

    _raise_error_if_not_sarray(sa, "sa")

    ## Remove rare words
    sf = _graphlab.SFrame({'docs': sa})
    fe = _graphlab.feature_engineering.RareWordTrimmer(features='docs',
                                                 threshold=threshold,
                                                 to_lower=to_lower,
                                                 delimiters=delimiters,
                                                 stopwords=stopwords,
                                                 output_column_prefix=None)
    tokens = fe.fit_transform(sf)

    return tokens['docs']
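
# A minimal sketch of the string case described above: lower-case, split
# on spaces, count words across the entire column, and keep only words
# whose column-wide count reaches `threshold`. Illustrative only; the
# RareWordTrimmer transformer is the supported implementation.
def _trim_rare_words_sketch(texts, threshold=2):
    tokenized = [text.lower().split() for text in texts]
    counts = {}
    for tokens in tokenized:
        for word in tokens:
            counts[word] = counts.get(word, 0) + 1
    return [' '.join(w for w in tokens if counts[w] >= threshold)
            for tokens in tokenized]

# >>> _trim_rare_words_sketch(['the quick fox and the slow fox', 'a fox'])
# ['the fox the fox', 'fox']
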
def count_ngrams(sa, n=2, method="word", to_lower=True,
    delimiters=["\r", "\v", "\n", "\f", "\t", " ",
                "!", "#", "$", "%", "&", "'", "(", ")",
                "*", "+", ",", "-", ".", "/", ":", ";",
                "<", "=", ">", "?", "@", "[", "\\", "]",
                "^", "_", "`", "{", "|", "}", "~"],
    ignore_punct=True, ignore_space=True):
    """
    count_ngrams(sa, n=2, method="word", to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " ", "!", "#", "$", "%", "&", "'", "(", ")", "*", "+", ",", "-", ".", "/", ":", ";", "<", "=", ">", "?", "@", "[", "\\\\", "]", "^", "_", "`", "{", "|", "}", "~"], ignore_punct=True, ignore_space=True)

    Return an SArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.
    The n-grams can be specified to be either character n-grams or word
    n-grams.  The input SArray could contain strings, dicts with string keys
    and numeric values, or lists of strings.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter`.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input text data. See
        :py:class:`~graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter`
        documentation for details on how string, dict, and list inputs are handled.

    n : int, optional
        The number of words in each n-gram. An ``n`` value of 1 returns word
        counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", does a character n-gram count.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    delimiters : list[str], None, optional
        If method is "word", input strings are tokenized using delimiter
        characters in this list. Each entry in this list must contain a single
        character. If set to `None`, then a Penn treebank-style tokenization is
        used, which contains smart handling of punctuation. If method is
        "character", this option is ignored.

    ignore_punct : bool, optional
        If method is "character", indicates if *punctuations* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun.games", if this parameter is set to False one
        tri-gram would be 'n.g'. If ``ignore_punct`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    ignore_space : bool, optional
        If method is "character", indicates if *spaces* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun games", if this parameter is set to False one
        tri-gram would be 'n g'. If ``ignore_space`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    Returns
    -------
    out : SArray[dict]
      An SArray of dictionary type, where each key is the n-gram string
      and each value is its count.

    See Also
    --------
    count_words, tokenize,
    graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter

    Notes
    -----
    - Ignoring case (with ``to_lower``) involves a full string copy of the
      SArray data. To increase speed for large documents, set ``to_lower`` to
      False.

    - Punctuation and spaces are both delimiters by default when counting
      word n-grams. When counting character n-grams, one may choose to ignore
      punctuation, spaces, neither, or both.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Counting word n-grams:
        >>> sa = graphlab.SArray(['I like big dogs. I LIKE BIG DOGS.'])
        >>> graphlab.text_analytics.count_ngrams(sa, 3)
        dtype: dict
        Rows: 1
        [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]

        # Counting character n-grams:
        >>> sa = graphlab.SArray(['Fun. Is. Fun'])
        >>> graphlab.text_analytics.count_ngrams(sa, 3, "character")
        dtype: dict
        Rows: 1
        [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]

        # Run count_ngrams with dictionary input
        >>> sa = graphlab.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                       {'a dog': 0, 'a dog cat': 5}])
        >>> graphlab.text_analytics.count_ngrams(sa)
        dtype: dict
        Rows: 2
        [{'bob alice': 0.5, 'alice bob': 1}, {'dog cat': 5, 'a dog': 5}]

        # Run count_ngrams with list input
        >>> sa = graphlab.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> graphlab.text_analytics.count_ngrams(sa)
        dtype: dict
        Rows: 2
        [{'bar bah': 1}, {'dog cat': 1, 'a dog': 2}]
    """

    _mt._get_metric_tracker().track('toolkit.text_analytics.count_ngrams')

    _raise_error_if_not_sarray(sa, "sa")

    ## Compute n-gram counts
    sf = _graphlab.SFrame({'docs': sa})
    fe = _graphlab.feature_engineering.NGramCounter(features='docs',
                                                    n=n,
                                                    method=method,
                                                    to_lower=to_lower,
                                                    delimiters=delimiters,
                                                    ignore_punct=ignore_punct,
                                                    ignore_space=ignore_space,
                                                    output_column_prefix=None)
    output_sf = fe.fit_transform(sf)

    return output_sf['docs']
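
# A small pure-Python sketch of word n-gram counting as described above:
# lower-case, tokenize, then count every run of n consecutive tokens.
# Splitting on anything non-alphanumeric approximates the default
# delimiter list; NGramCounter is the supported implementation.
def _word_ngrams_sketch(text, n=2):
    import re
    tokens = re.findall(r'[a-z0-9]+', text.lower())
    counts = {}
    for i in range(len(tokens) - n + 1):
        gram = ' '.join(tokens[i:i + n])
        counts[gram] = counts.get(gram, 0) + 1
    return counts

# >>> _word_ngrams_sketch('I like big dogs. I LIKE BIG DOGS.', 3)
# {'i like big': 2, 'like big dogs': 2, 'big dogs i': 1, 'dogs i like': 1}
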
def count_words(sa, to_lower=True,
                delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    count_words(sa, to_lower=True, delimiters=["\\\\r", "\\\\v", "\\\\n", "\\\\f", "\\\\t", " "])

    Convert the content of string/dict/list type SArrays to a dictionary of
    (word, count) pairs. Dictionary keys and list elements must be strings.
    The strings are first tokenized into words according to the specified
    `to_lower` and `delimiters` options. Then, word counts are accumulated.
    In each output dictionary, the keys are the words in the corresponding
    input data entry, and the values are the number of times each word appears.
    By default, words are split on all whitespace and newline characters. The
    output is commonly known as the "bag-of-words" representation of text data.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._word_counter.WordCounter`.

    Parameters
    ----------
    sa : SArray[str | dict | list]
        Input data to be tokenized and counted. See
        :py:class:`~graphlab.toolkits.feature_engineering._word_counter.WordCounter`
        documentation for details on how string, dict, and list inputs are handled.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which contains
        smart handling of punctuation.

    Returns
    -------
    out : SArray[dict]
        Each entry contains a dictionary with the frequency count of each word
        in the corresponding input entry.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._word_counter.WordCounter

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_
    - `Penn treebank tokenization <https://www.cis.upenn.edu/~treebank/tokenization.html>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps.",
        ...                       "Word word WORD, word!!!word"])

        # Run count_words
        >>> graphlab.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]

        # Run count_words with Penn treebank style tokenization to handle
        # punctuation
        >>> graphlab.text_analytics.count_words(sa, delimiters=None)
        dtype: dict
        Rows: 2
        [{'brown': 1, 'jumps': 1, 'fox': 1, '.': 1, 'quick': 1, 'the': 1},
         {'word': 3, 'word!!!word': 1, ',': 1}]

        # Run count_words with dictionary input
        >>> sa = graphlab.SArray([{'alice bob': 1, 'Bob alice': 0.5},
        ...                       {'a dog': 0, 'a dog cat': 5}])
        >>> graphlab.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bob': 1.5, 'alice': 1.5}, {'a': 5, 'dog': 5, 'cat': 5}]

        # Run count_words with list input
        >>> sa = graphlab.SArray([['one', 'bar bah'], ['a dog', 'a dog cat']])
        >>> graphlab.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'bar': 1, 'bah': 1, 'one': 1}, {'a': 2, 'dog': 2, 'cat': 1}]

    """

    _mt._get_metric_tracker().track('toolkit.text_analytics.count_words')

    _raise_error_if_not_sarray(sa, "sa")

    ## Compute word counts
    sf = _graphlab.SFrame({'docs': sa})
    fe = _graphlab.feature_engineering.WordCounter(features='docs',
                                                   to_lower=to_lower,
                                                   delimiters=delimiters,
                                                   output_column_prefix=None)
    output_sf = fe.fit_transform(sf)

    return output_sf['docs']
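
# A pure-Python sketch of the default bag-of-words behaviour for string
# input documented above: lower-case, split on whitespace, count.
# Illustrative only; the WordCounter transformer is the supported
# implementation.
def _count_words_sketch(text):
    counts = {}
    for word in text.lower().split():
        counts[word] = counts.get(word, 0) + 1
    return counts

# >>> _count_words_sketch('The quick brown fox jumps.')
# {'the': 1, 'quick': 1, 'brown': 1, 'fox': 1, 'jumps.': 1}
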
def extract_parts_of_speech(sa, chosen_pos=[PartOfSpeech.ADJ]):
    """
    This function takes SArrays of type string or list, along with a list of
    parts of speech. If the input SArray is of type list, each element must be
    of type list or string. The output is of type dict, where each key is a
    chosen part of speech, and each value is a bag-of-words of the words in
    the input which belong to that part of speech.

    This function is implemented using
    :py:class:`~graphlab.toolkits.feature_engineering._part_of_speech_extractor.PartOfSpeechExtractor`.

    Parameters
    ----------
    sa : SArray[str | list]
        Input data to extract certain parts of speech from.

    chosen_pos : list[graphlab.text_analytics.PartOfSpeech], optional
        List of part-of-speech values from the
        graphlab.text_analytics.PartOfSpeech enumeration. The transformer
        will only select words of these parts of speech. By default it
        selects adjectives.

    Returns
    -------
    out : SArray[dict]
        Each element is a dictionary mapping each chosen part of speech to a
        bag-of-words of the words belonging to that part of speech.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._part_of_speech_extractor.PartOfSpeechExtractor

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps.The slow brown fox" +
            " crawls"])

        # Run extract_parts_of_speech
        >>> graphlab.text_analytics.extract_parts_of_speech(sa)
        dtype: dict
        Rows: 1
        [{'ADJ': {'quick': 1, 'brown': 1, 'slow': 1}}]


        # List type input
        # Create input data
        >>> sa = graphlab.SArray([["The quick brown fox jumps.","The slow brown fox" +
            " crawls"]])

        # Run extract_parts_of_speech
        >>> graphlab.text_analytics.extract_parts_of_speech(sa)
        dtype: dict
        Rows: 1
        [{'ADJ': {'quick': 1, 'brown': 1, 'slow': 1}}]

"""

    _mt._get_metric_tracker().track('toolkit.text_analytics.extract_parts_of_speech')

    _raise_error_if_not_sarray(sa, "sa")

    sf = _graphlab.SFrame({'docs': sa})
    fe = _graphlab.feature_engineering.PartOfSpeechExtractor(features='docs',
                                                   chosen_pos=chosen_pos,
                                                   output_column_prefix=None,
                                                   verbose=False)
    output_sf = fe.fit_transform(sf)

    return output_sf['docs']
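
# A rough approximation of adjective extraction using NLTK (an assumption
# for illustration only -- the toolkit does not use NLTK). Penn treebank
# adjective tags start with 'JJ'. Requires the 'punkt' and
# 'averaged_perceptron_tagger' NLTK data packages to be downloaded.
def _extract_adjectives_sketch(text):
    import nltk
    tagged = nltk.pos_tag(nltk.word_tokenize(text.lower()))
    counts = {}
    for word, tag in tagged:
        if tag.startswith('JJ'):
            counts[word] = counts.get(word, 0) + 1
    return {'ADJ': counts}

# >>> _extract_adjectives_sketch('The quick brown fox jumps. The slow brown fox crawls')
# e.g. {'ADJ': {'quick': 1, 'brown': 2, 'slow': 1}} (exact output depends
# on the tagger)
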
def count_ngrams(sa, n=2, method="word", to_lower=True, ignore_space=True):
    """

    Return an SArray of ``dict`` type where each element contains the count
    for each of the n-grams that appear in the corresponding input element.
    The n-grams can be specified to be either character n-grams or word
    n-grams.

    Parameters
    ----------
    sa : SArray[str]
        Input text data.

    n : int, optional
        The number of words in each n-gram. An ``n`` value of 1 returns word
        counts.

    method : {'word', 'character'}, optional
        If "word", the function performs a count of word n-grams. If
        "character", does a character n-gram count.

    to_lower : bool, optional
        If True, all words are converted to lower case before counting.

    ignore_space : bool, optional
        If method is "character", indicates if *spaces* between words are
        counted as part of the n-gram. For instance, with the input SArray
        element of "fun games", if this parameter is set to False one
        tri-gram would be 'n g'. If ``ignore_space`` is set to True, there
        would be no such tri-gram (there would still be 'nga'). This
        parameter has no effect if the method is set to "word".

    Returns
    -------
    out : SArray[dict]
      An SArray of dictionary type, where each key is the n-gram string
      and each value is its count.

    See Also
    --------
    count_words, tokenize,
    graphlab.toolkits.feature_engineering._ngram_counter.NGramCounter

    Notes
    -----
    - Ignoring case (with ``to_lower``) involves a full string copy of the
      SArray data. To increase speed for large documents, set ``to_lower`` to
      False.

    - Punctuation and spaces are both delimiters by default when counting
      word n-grams. When counting character n-grams, one may choose to ignore
      punctuation, spaces, neither, or both.

    References
    ----------
    - `N-gram wikipedia article <http://en.wikipedia.org/wiki/N-gram>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Counting word n-grams:
        >>> sa = graphlab.SArray(['I like big dogs. I LIKE BIG DOGS.'])
        >>> graphlab.text_analytics.count_ngrams(sa, 3)
        dtype: dict
        Rows: 1
        [{'big dogs i': 1, 'like big dogs': 2, 'dogs i like': 1, 'i like big': 2}]

        # Counting character n-grams:
        >>> sa = graphlab.SArray(['Fun. Is. Fun'])
        >>> graphlab.text_analytics.count_ngrams(sa, 3, "character")
        dtype: dict
        Rows: 1
        [{'fun': 2, 'nis': 1, 'sfu': 1, 'isf': 1, 'uni': 1}]
    """

    _mt._get_metric_tracker().track('toolkit.text_analytics.count_ngrams')
    _raise_error_if_not_sarray(sa, "sa")
    _raise_error_if_not_of_type(to_lower, [bool])
    _raise_error_if_not_of_type(n, [int])
    _raise_error_if_not_of_type(method, [str])
    _raise_error_if_not_of_type(ignore_space, [bool])

    if n < 1:
        raise ValueError("Input 'n' must be greater than 0")
    if method != "word" and method != "character":
        raise ValueError("Invalid 'method' input value. Please input "
                         "either 'word' or 'character'.")
    if n > 5 and method == 'word':
        warnings.warn("It is unusual for n-grams to be of size larger than 5.")
    return _graphlab.extensions._count_ngrams(sa, n, method, to_lower,
                                              ignore_space)
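
# A pure-Python sketch of character n-gram counting with the
# `ignore_space` behaviour described above: optionally drop spaces (and,
# matching this variant, always drop punctuation -- an assumption) before
# sliding a window of n characters over the text. Illustrative only.
def _char_ngrams_sketch(text, n=3, ignore_space=True):
    chars = ''.join(c for c in text.lower()
                    if c.isalnum() or (c == ' ' and not ignore_space))
    counts = {}
    for i in range(len(chars) - n + 1):
        gram = chars[i:i + n]
        counts[gram] = counts.get(gram, 0) + 1
    return counts

# >>> _char_ngrams_sketch('Fun. Is. Fun', 3)
# {'fun': 2, 'uni': 1, 'nis': 1, 'isf': 1, 'sfu': 1}
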
def count_words(sa, to_lower=True,
                delimiters=["\r", "\v", "\n", "\f", "\t", " "]):
    """
    Convert the content of string type SArrays to a dictionary of (word, count)
    pairs. Dictionary keys and list elements must be strings.  The strings are
    first tokenized into words according to the specified `to_lower` and
    `delimiters` options. Then, word counts are accumulated.  In each output
    dictionary, the keys are the words in the corresponding input data entry,
    and the values are the number of times each word appears.  By default,
    words are split on all whitespace and newline characters. The output is
    commonly known as the "bag-of-words" representation of text data.

    Parameters
    ----------
    sa : SArray[str]
        Input data to be tokenized and counted. See
        :py:class:`~graphlab.toolkits.feature_engineering._word_counter.WordCounter`
        documentation for details on how string, dict, and list inputs are handled.

    to_lower : bool, optional
        If True, all strings are converted to lower case before counting.

    delimiters : list[str], None, optional
        Input strings are tokenized using delimiter characters in this list.
        Each entry in this list must contain a single character. If set to
        `None`, then a Penn treebank-style tokenization is used, which contains
        smart handling of punctuation.

    Returns
    -------
    out : SArray[dict]
        Each entry contains a dictionary with the frequency count of each word
        in the corresponding input entry.

    See Also
    --------
    count_ngrams, tf_idf, tokenize,
    graphlab.toolkits.feature_engineering._word_counter.WordCounter

    References
    ----------
    - `Bag of words model <http://en.wikipedia.org/wiki/Bag-of-words_model>`_

    Examples
    --------
    .. sourcecode:: python

        >>> import graphlab

        # Create input data
        >>> sa = graphlab.SArray(["The quick brown fox jumps.",
        ...                       "Word word WORD, word!!!word"])

        # Run count_words
        >>> graphlab.text_analytics.count_words(sa)
        dtype: dict
        Rows: 2
        [{'quick': 1, 'brown': 1, 'the': 1, 'fox': 1, 'jumps.': 1},
         {'word,': 1, 'word!!!word': 1, 'word': 2}]
    """

    _mt._get_metric_tracker().track('toolkit.text_analytics.count_words')
    _raise_error_if_not_sarray(sa, "sa")
    return _graphlab.extensions._count_words(sa, to_lower, delimiters)