Example #1
def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
    """
    A text tokenizer that passes only terms (a.k.a. 'entities') explicitly
    contained in the entities argument.

    Parameters
    ----------
    text : str
        A single text document to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> ents = _collect_entities(sample_corpus)
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _tokenize_entities_document(text,ents)
    >>> tokenized_text == [
    ...     u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns']
    True
    """
    result = []
    for np in TextBlob(text).noun_phrases:
        if np in entities:
            # filter out stop words
            tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result
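
All of the tokenizers on this page delegate basic word splitting to a _simple_document helper that is not shown here. A minimal sketch of what such a helper could look like, assuming it lowercases the text, splits on non-word characters, and drops short words and stopwords (the real implementation may differ):

import re

def _simple_document(text, min_length=1, stopwords=None):
    # Hypothetical sketch of the helper assumed by the examples on this page:
    # lowercase the text, split it into word tokens, then drop tokens that are
    # too short or that appear in the stopword collection.
    stopwords = set(stopwords) if stopwords else set()
    tokens = re.findall(r"\w+", text.lower(), re.UNICODE)
    return [t for t in tokens if len(t) >= min_length and t not in stopwords]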
Example #2
def _collect_bigrams_and_trigrams(raw_corpus,
                                  top_n=10000,
                                  min_length=1,
                                  min_freqs=None,
                                  stopwords=None):
    """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.

    bigrams are pairs of words that recur in the collection; trigrams are triplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : iterable of int
        Minimum frequency for a word sequence to count as a recognized n-gram,
        one entry per n-gram size, starting with bigrams.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """

    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    # generator of documents, turn each element to its list of words
    doc_texts = (_simple_document(doc_text,
                                  min_length=min_length,
                                  stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)
    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)
    tcf = TrigramCollocationFinder.from_words(iter(words))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freqs[0])
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]

    tcf.apply_freq_filter(min_freqs[1])
    trigrams = [
        ' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)
    ]

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    return bigrams_patterns, trigrams_patterns
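
A hedged usage sketch: the corpus below is invented for illustration (the sample_corpus used in the doctests is not shown on this page), but it demonstrates the expected (doc_id, doc_text) input shape and how the two returned regex patterns can be inspected.

# Hypothetical corpus; any iterable of (doc_id, doc_text) tuples works.
tiny_corpus = [
    ("doc1", "frank swank tank walked the sassy unicorns"),
    ("doc2", "frank swank tank prancercised with sassy unicorns"),
]

bigram_pat, trigram_pat = _collect_bigrams_and_trigrams(tiny_corpus, min_freqs=[2, 2])
print(bigram_pat.pattern)   # e.g. '(frank swank|swank tank|sassy unicorns)'
print(trigram_pat.pattern)  # e.g. '(frank swank tank)'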
Example #3
def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
    """
    A text tokenizer that extracts entities ('noun phrases') first and falls back to simple words for the rest of the text.

    Parameters
    ----------
    text : str
        A single text document to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> ents = _collect_entities(sample_corpus)
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _tokenize_mixed_document(text,ents)
    >>> tokenized_text == [u'frank', u'swank_tank', u'sassy', u'unicorn',
    ... u'brony', u'prancercise', u'class', u'prancercise', u'popular',
    ... u'pastime', u'sassy_unicorns']
    True
    """
    result = []
    for np in TextBlob(text).noun_phrases:
        if ' ' in np and np not in entities:
            # break apart the noun phrase; it does not occur often enough in the collection of text to be considered.
            result.extend(
                _simple_document(np,
                                 min_length=min_length,
                                 stopwords=stopwords))
        else:
            # filter out stop words
            tmp = "_".join(
                _simple_document(np,
                                 min_length=min_length,
                                 stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result
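
The _collect_entities function referenced in these docstrings is not included on this page. One plausible sketch, assuming it gathers TextBlob noun phrases across the whole corpus and keeps those that occur at least freq_min times (the parameter name and default threshold are assumptions, not the library's confirmed API):

from collections import Counter

from textblob import TextBlob

def _collect_entities(raw_corpus, freq_min=2):
    # Hypothetical sketch: count every noun phrase TextBlob extracts from the
    # corpus and keep the ones that recur at least freq_min times.
    counts = Counter()
    for doc_id, doc_text in raw_corpus:
        counts.update(TextBlob(doc_text).noun_phrases)
    return {np for np, count in counts.items() if count >= freq_min}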
Example #4
def _tokenize_mixed_document(text, entities, min_length=1, stopwords=None):
    """
    A text tokenizer that extracts entities ('noun phrases') first and falls back to simple words for the rest of the text.

    Parameters
    ----------
    text : str
        A single text document to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> ents = _collect_entities(sample_corpus)
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _tokenize_mixed_document(text,ents)
    >>> tokenized_text == [u'frank', u'swank_tank', u'sassy', u'unicorn',
    ... u'brony', u'prancercise', u'class', u'prancercise', u'popular',
    ... u'pastime', u'sassy_unicorns']
    True
    """
    result = []
    for np in TextBlob(text).noun_phrases:
        if " " in np and np not in entities:
            # break apart the noun phrase; it does not occur often enough in the collection of text to be considered.
            result.extend(_simple_document(np, min_length=min_length, stopwords=stopwords))
        else:
            # filter out stop words
            tmp = "_".join(_simple_document(np, min_length=min_length, stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result
Example #5
def _collect_bigrams_and_trigrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None):
    """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.

    bigrams are pairs of words that recur in the collection; trigrams are triplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : iterable of int
        Minimum frequency for a word sequence to count as a recognized n-gram,
        one entry per n-gram size, starting with bigrams.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """

    from nltk.collocations import TrigramCollocationFinder
    from nltk.metrics import BigramAssocMeasures, TrigramAssocMeasures

    # generator of documents, turn each element to its list of words
    doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)
    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)
    tcf = TrigramCollocationFinder.from_words(iter(words))

    bcf = tcf.bigram_finder()
    bcf.apply_freq_filter(min_freqs[0])
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]

    tcf.apply_freq_filter(min_freqs[1])
    trigrams = [' '.join(w) for w in tcf.nbest(TrigramAssocMeasures.chi_sq, top_n)]

    bigrams_patterns = re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)
    trigrams_patterns = re.compile('(%s)' % '|'.join(trigrams), re.UNICODE)

    return bigrams_patterns, trigrams_patterns
Example #6
def _collect_ngrams(raw_corpus,
                    top_n=10000,
                    min_length=1,
                    min_freqs=None,
                    stopwords=None):
    """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.

    bigrams are pairs of words that recur in the collection; trigrams/quadgrams are triplets/quadruplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : iterable of int
        Minimum frequency for a word sequence to count as a recognized n-gram,
        one entry per n-gram size, starting with bigrams.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_ngrams(sample_corpus, min_freqs=[2, 2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """

    # generator of documents, turn each element to its list of words
    doc_texts = (_simple_document(doc_text,
                                  min_length=min_length,
                                  stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)

    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)

    words_iterators = itertools.tee(words, 3)
    bigrams_patterns = _get_bigrams(words_iterators[0], top_n, min_freqs[0])
    trigrams_patterns = _get_trigrams(words_iterators[1], top_n, min_freqs[1])
    quadgrams_patterns = _get_quadgrams(words_iterators[2], top_n,
                                        min_freqs[2])

    return (bigrams_patterns, trigrams_patterns, quadgrams_patterns)
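
The _get_bigrams, _get_trigrams, and _get_quadgrams helpers used here are not shown on this page. Modeled on the bigram logic in Example #2, a plausible sketch of _get_bigrams (the trigram and quadgram variants would follow the same shape with NLTK's trigram and quadgram collocation finders):

import re

from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

def _get_bigrams(words, top_n, min_freq):
    # Hypothetical sketch mirroring Example #2: keep bigrams that occur at
    # least min_freq times, rank the top_n of them by PMI, and compile them
    # into a single alternation regex.
    bcf = BigramCollocationFinder.from_words(words)
    bcf.apply_freq_filter(min_freq)
    bigrams = [' '.join(w) for w in bcf.nbest(BigramAssocMeasures.pmi, top_n)]
    return re.compile('(%s)' % '|'.join(bigrams), re.UNICODE)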
Example #7
def _collocation_document(text, patterns, min_length=1, stopwords=None):
    """A text tokenizer that includes collocations(bigrams and trigrams).

    A collocation is sequence of words or terms that co-occur more often
    than would be expected by chance.  This function breaks a raw document
    up into tokens based on a pre-established collection of bigrams and
    trigrams.  This collection is derived from a body of many documents, and
    must be obtained in a prior step using the collect_bigrams_and_trigrams
    function.

    Uses nltk.collocations.TrigramCollocationFinder to
    find trigrams and bigrams.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    patterns: tuple of compiled regex object to find n-grams
        Obtained from collect_bigrams_and_trigrams function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_bigrams_and_trigrams(sample_corpus, min_freqs=[2, 2])
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _collocation_document(text,patterns)
    >>> tokenized_text == [
    ...     u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
    ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
    ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
    True
    """
    text = ' '.join(
        _simple_document(text, min_length=min_length, stopwords=stopwords))
    for pattern in patterns:
        text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'),
                      text)
    return text.split()
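
A small illustration of the substitution step above, using a hand-written toy pattern in place of one produced by _collect_bigrams_and_trigrams:

import re

# Toy pattern standing in for a compiled collocation pattern.
pattern = re.compile('(frank swank|sassy unicorns)', re.UNICODE)
text = 'frank swank tank walked the sassy unicorns'
joined = re.sub(pattern, lambda m: m.group(0).replace(' ', '_'), text)
print(joined.split())
# ['frank_swank', 'tank', 'walked', 'the', 'sassy_unicorns']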
Example #8
def _collect_ngrams(raw_corpus, top_n=10000, min_length=1, min_freqs=None, stopwords=None):
    """collects bigrams and trigrams from collection of documents.  Input to collocation tokenizer.

    bigrams are pairs of words that recur in the collection; trigrams/quadgrams are triplets/quadruplets.

    Parameters
    ----------
    raw_corpus : iterable of tuple of (doc_id(str/int), doc_text(str))
        body of documents to examine
    top_n : int
        limit results to this many entries
    min_length : int
        Minimum length of any single word
    min_freqs : iterable of int
        Minimum frequency for a word sequence to count as a recognized n-gram,
        one entry per n-gram size, starting with bigrams.
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_ngrams(sample_corpus, min_freqs=[2, 2, 2])
    >>> patterns[0].pattern
    u'(frank swank|swank tank|sassy unicorns)'
    >>> patterns[1].pattern
    u'(frank swank tank)'
    """

    # generator of documents, turn each element to its list of words
    doc_texts = (_simple_document(doc_text, min_length=min_length, stopwords=stopwords)
                 for doc_id, doc_text in raw_corpus)

    # generator, concatenate (chain) all words into a single sequence, lazily
    words = itertools.chain.from_iterable(doc_texts)

    words_iterators = itertools.tee(words, 3)
    bigrams_patterns = _get_bigrams(words_iterators[0], top_n, min_freqs[0])
    trigrams_patterns = _get_trigrams(words_iterators[1], top_n, min_freqs[1])
    quadgrams_patterns = _get_quadgrams(words_iterators[2], top_n, min_freqs[2])

    return (bigrams_patterns, trigrams_patterns, quadgrams_patterns)
Example #9
def _collocation_document(text, patterns, min_length=1, stopwords=None):
    """A text tokenizer that includes collocations(bigrams and trigrams).

    A collocation is sequence of words or terms that co-occur more often
    than would be expected by chance.  This function breaks a raw document
    up into tokens based on a pre-established collection of bigrams, trigrams,
    and trigrams.  This collection is derived from a body of many documents, and
    must be obtained in a prior step using the collect_ngrams
    function.

    Uses nltk.collocations.(Bi/Tri/Quad)gramCollocationFinder to
    find bigrams/trigrams/quadgrams.

    Parameters
    ----------
    text : str
        A single document's text to be tokenized
    patterns: tuple of compiled regex object to find n-grams
        Obtained from collect_ngrams function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> patterns = _collect_ngrams(sample_corpus, min_freqs=[2, 2, 2])
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _collocation_document(text,patterns)
    >>> tokenized_text == [
    ...     u'frank_swank', u'tank', u'walked', u'sassy', u'unicorn', u'brony',
    ...     u'prancercise', u'class', u'daily', u'prancercise', u'tremendously',
    ...     u'popular', u'pastime', u'sassy_unicorns', u'retirees', u'alike']
    True
    """
    text = ' '.join(_simple_document(text, min_length=min_length, stopwords=stopwords))
    for pattern in patterns:
        text = re.sub(pattern, lambda match: match.group(0).replace(' ', '_'), text)
    return text.split()
Example #10
def _tokenize_entities_document(text, entities, min_length=1, stopwords=None):
    '''
    A text tokenizer that passes only terms (a.k.a. 'entities') explicitly
    contained in the entities argument.

    Parameters
    ----------
    text : str
        A single text document to be tokenized
    entities : iterable of str
        Collection of noun phrases, obtained from collect_entities function
    min_length : int
        Minimum length of any single word
    stopwords : None or iterable of str
        Collection of words to ignore as tokens

    Examples
    --------
    >>> ents = _collect_entities(sample_corpus)
    >>> text = sample_corpus[0][1]
    >>> tokenized_text = _tokenize_entities_document(text,ents)
    >>> tokenized_text == [
    ...     u'frank', u'swank_tank', u'prancercise', u'sassy_unicorns']
    True
    '''
    result = []
    for np in TextBlob(text).noun_phrases:
        if np in entities:
            # filter out stop words
            tmp = "_".join(
                _simple_document(np,
                                 min_length=min_length,
                                 stopwords=stopwords))
            # if we end up with nothing, don't append an empty string
            if tmp:
                result.append(tmp)
    return result
Example #11
def test__simple_document():
    assert(_simple_document(sample_data[0][1]) == ["frank", "frank", "frank",
                                                   "dog", "cat"])