コード例 #1
0
 def __init__(self, doc):
     """
     Pre-compute the basic counts that the readability statistics draw on.

     Args:
         doc: a ``textacy.Doc`` or ``spacy.Doc`` to take the counts from

     Raises:
         ValueError: if ``doc`` is neither a ``textacy.Doc`` nor a ``spacy.Doc``
     """
     # language and sentence count are read differently for each doc type
     if isinstance(doc, SpacyDoc):
         lang = doc.vocab.lang
         self.n_sents = len(list(doc.sents))
     elif isinstance(doc, textacy.Doc):
         lang = doc.lang
         self.n_sents = doc.n_sents
     else:
         raise ValueError(
             '``doc`` must be a ``textacy.Doc`` or ``spacy.Doc``')
     # per-word raw material: tokens, syllable estimates, character lengths
     hyphenator = data.load_hyphenator(lang=lang)
     tokens = list(
         extract.words(
             doc, filter_punct=True, filter_stops=False, filter_nums=False))
     # syllables in a word are estimated as its hyphenation points plus one
     syllable_counts = [
         len(hyphenator.positions(tok.lower_)) + 1 for tok in tokens]
     char_counts = [len(tok) for tok in tokens]
     # aggregate counts; assigned in a fixed order so a failure part-way
     # through leaves the same attributes set as before
     self.n_words = len(tokens)
     # NOTE(review): ``lower`` (no underscore) is presumably spaCy's integer
     # ID for the lowercased form, which is enough for de-duplication
     self.n_unique_words = len(set(tok.lower for tok in tokens))
     self.n_chars = sum(char_counts)
     # "long" means at least 7 characters
     self.n_long_words = len([n for n in char_counts if n >= 7])
     self.n_syllables = sum(syllable_counts)
     self.n_monosyllable_words = syllable_counts.count(1)
     # "polysyllable" means at least 3 syllables
     self.n_polysyllable_words = len([n for n in syllable_counts if n >= 3])
コード例 #2
0
def readability_stats(doc):
    """
    Compute a set of "readability" statistics for a text: Flesch-Kincaid Grade
    Level, Flesch Reading Ease, SMOG Index, Gunning-Fog Index, Coleman-Liau
    Index, and Automated Readability Index.

    The raw counts the statistics are derived from (sentences, words, unique
    words, characters, syllables, polysyllable words) are returned as well.

    **DEPRECATED**: use the ``TextStats`` class instead.

    Args:
        doc (:class:`textacy.Doc <textacy.document.Doc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value (int or
        float), or None if ``doc`` contains no words

    Raises:
        NotImplementedError: if ``doc`` is not English language. sorry.
    """
    deprecation_msg = '`readability_stats()` function is deprecated; use `TextStats` class instead'
    # the filter change is scoped to this ``with`` block and undone on exit
    with warnings.catch_warnings():
        warnings.simplefilter('once', DeprecationWarning)
        warnings.warn(deprecation_msg, DeprecationWarning)

    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')

    n_sents = doc.n_sents

    tokens = list(
        extract.words(
            doc, filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(tokens)
    if not tokens:
        # nothing to compute the statistics from, so warn and bail out
        logging.warning("readability stats can't be computed because doc has 0 words")
        return None
    n_unique_words = len(set(tok.lower for tok in tokens))
    n_chars = sum(map(len, tokens))

    hyphenator = data.load_hyphenator(lang='en')
    # syllables in a word are estimated as its hyphenation points plus one
    syllable_counts = [
        len(hyphenator.positions(tok.lower_)) + 1 for tok in tokens]
    n_syllables = sum(syllable_counts)
    # "polysyllable" means at least 3 syllables
    n_polysyllable_words = len([n for n in syllable_counts if n >= 3])

    # raw counts first, then the derived indices, in the original key order
    stats = {
        'n_sents': n_sents,
        'n_words': n_words,
        'n_unique_words': n_unique_words,
        'n_chars': n_chars,
        'n_syllables': n_syllables,
        'n_polysyllable_words': n_polysyllable_words,
    }
    stats['flesch_kincaid_grade_level'] = flesch_kincaid_grade_level(
        n_syllables, n_words, n_sents)
    stats['flesch_readability_ease'] = flesch_readability_ease(
        n_syllables, n_words, n_sents)
    stats['smog_index'] = smog_index(n_polysyllable_words, n_sents)
    stats['gunning_fog_index'] = gunning_fog_index(
        n_words, n_polysyllable_words, n_sents)
    stats['coleman_liau_index'] = coleman_liau_index(n_chars, n_words, n_sents)
    stats['automated_readability_index'] = automated_readability_index(
        n_chars, n_words, n_sents)
    return stats
コード例 #3
0
ファイル: text_stats.py プロジェクト: EricSchles/textacy
def readability_stats(doc):
    """
    Get calculated values for a variety of statistics related to the "readability"
    of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease, SMOG Index,
    Gunning-Fog Index, Coleman-Liau Index, and Automated Readability Index.

    Also includes constituent values needed to compute the stats, e.g. word count.

    Args:
        doc (:class:`texts.TextDoc() <textacy.texts.TextDoc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value (int or float),
        or None if ``doc`` contains no words (a warning is logged in that case)

    Raises:
        NotImplementedError: if ``doc`` is not English language. sorry.
    """
    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')

    n_sents = doc.n_sents

    words = list(doc.words(filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(words)
    if n_words == 0:
        # The indices below are built from per-word and per-sentence counts, so
        # a doc without words has no meaningful value. Bail out here instead of
        # handing zero counts to the formula helpers (defined elsewhere;
        # presumably they would fail with a ZeroDivisionError).
        import logging  # local import: this module's import block is not assumed to have it
        logging.warning("readability stats can't be computed because doc has 0 words")
        return None
    n_unique_words = len({word.lower for word in words})
    n_chars = sum(len(word) for word in words)

    hyphenator = data.load_hyphenator(lang='en')
    # syllables in a word are estimated as its hyphenation points plus one
    syllables_per_word = [len(hyphenator.positions(word.lower_)) + 1 for word in words]
    n_syllables = sum(syllables_per_word)
    # "polysyllable" means at least 3 syllables
    n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3)

    return {'n_sents': n_sents,
            'n_words': n_words,
            'n_unique_words': n_unique_words,
            'n_chars': n_chars,
            'n_syllables': n_syllables,
            'n_polysyllable_words': n_polysyllable_words,
            'flesch_kincaid_grade_level': flesch_kincaid_grade_level(n_syllables, n_words, n_sents),
            'flesch_readability_ease': flesch_readability_ease(n_syllables, n_words, n_sents),
            'smog_index': smog_index(n_polysyllable_words, n_sents),
            'gunning_fog_index': gunning_fog_index(n_words, n_polysyllable_words, n_sents),
            'coleman_liau_index': coleman_liau_index(n_chars, n_words, n_sents),
            'automated_readability_index': automated_readability_index(n_chars, n_words, n_sents)}
コード例 #4
0
def readability_stats(doc):
    """
    Get calculated values for a variety of statistics related to the "readability"
    of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease, SMOG Index,
    Gunning-Fog Index, Coleman-Liau Index, and Automated Readability Index.

    Also includes constituent values needed to compute the stats, e.g. word count.

    Args:
        doc (:class:`texts.TextDoc() <textacy.texts.TextDoc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value (int or float),
        or None if ``doc`` contains no words (a warning is logged in that case)

    Raises:
        NotImplementedError: if ``doc`` is not English language. sorry.
    """
    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')

    n_sents = doc.n_sents

    words = list(
        doc.words(filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(words)
    if not words:
        # Without any words there is nothing to derive the indices from, so
        # stop before the formula helpers see zero counts (they are defined
        # elsewhere; presumably they would fail with a ZeroDivisionError).
        import logging  # local import: this module's import block is not assumed to have it
        logging.warning(
            "readability stats can't be computed because doc has 0 words")
        return None
    n_unique_words = len({word.lower for word in words})
    n_chars = sum(len(word) for word in words)

    hyphenator = data.load_hyphenator(lang='en')
    # syllables in a word are estimated as its hyphenation points plus one
    syllables_per_word = [
        len(hyphenator.positions(word.lower_)) + 1 for word in words
    ]
    n_syllables = sum(syllables_per_word)
    # "polysyllable" means at least 3 syllables
    n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3)

    return {
        'n_sents': n_sents,
        'n_words': n_words,
        'n_unique_words': n_unique_words,
        'n_chars': n_chars,
        'n_syllables': n_syllables,
        'n_polysyllable_words': n_polysyllable_words,
        'flesch_kincaid_grade_level':
            flesch_kincaid_grade_level(n_syllables, n_words, n_sents),
        'flesch_readability_ease':
            flesch_readability_ease(n_syllables, n_words, n_sents),
        'smog_index':
            smog_index(n_polysyllable_words, n_sents),
        'gunning_fog_index':
            gunning_fog_index(n_words, n_polysyllable_words, n_sents),
        'coleman_liau_index':
            coleman_liau_index(n_chars, n_words, n_sents),
        'automated_readability_index':
            automated_readability_index(n_chars, n_words, n_sents),
    }