def __init__(self, doc):
    """
    Compute and store the basic counts that the readability statistics
    are built from.

    Sets ``n_sents``, ``n_words``, ``n_unique_words``, ``n_chars``,
    ``n_long_words``, ``n_syllables``, ``n_monosyllable_words`` and
    ``n_polysyllable_words`` on the instance.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``): document to analyze

    Raises:
        ValueError: if ``doc`` is neither a ``textacy.Doc`` nor a ``spacy.Doc``
    """
    # the language and the sentence count are read differently
    # depending on which kind of document was passed in
    if isinstance(doc, SpacyDoc):
        lang = doc.vocab.lang
        self.n_sents = sum(1 for _ in doc.sents)
    elif isinstance(doc, textacy.Doc):
        lang = doc.lang
        self.n_sents = doc.n_sents
    else:
        raise ValueError(
            '``doc`` must be a ``textacy.Doc`` or ``spacy.Doc``')

    # gather the raw per-word measurements first
    hyphenator = data.load_hyphenator(lang=lang)
    tokens = tuple(extract.words(
        doc, filter_punct=True, filter_stops=False, filter_nums=False))
    # syllables per word are estimated as (hyphenation positions + 1)
    syllable_counts = tuple(
        len(hyphenator.positions(token.lower_)) + 1 for token in tokens)
    char_counts = tuple(len(token) for token in tokens)

    # word- and character-level totals
    self.n_words = len(tokens)
    # NOTE(review): ``lower`` (no underscore) is presumably spaCy's integer
    # ID of the lowercased form, so this counts case-insensitively -- confirm
    self.n_unique_words = len(set(token.lower for token in tokens))
    self.n_chars = sum(char_counts)
    # "long" words have 7 or more characters
    self.n_long_words = len([n for n in char_counts if n >= 7])

    # syllable-level totals
    self.n_syllables = sum(syllable_counts)
    self.n_monosyllable_words = len([n for n in syllable_counts if n == 1])
    # "polysyllable" words have 3 or more syllables
    self.n_polysyllable_words = len([n for n in syllable_counts if n >= 3])
def readability_stats(doc):
    """
    Compute a set of "readability" statistics for a text: Flesch-Kincaid
    Grade Level, Flesch Reading Ease, SMOG Index, Gunning-Fog Index,
    Coleman-Liau Index, and Automated Readability Index. The underlying
    counts (sentences, words, characters, syllables, ...) are included too.

    **DEPRECATED**: a ``DeprecationWarning`` is emitted on call; use the
    ``TextStats`` class instead.

    Args:
        doc (:class:`textacy.Doc <textacy.document.Doc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value
        (int or float), or None if ``doc`` contains no words (in which case
        a warning is logged)

    Raises:
        NotImplementedError: if ``doc`` is not English language. sorry.
    """
    msg = '`readability_stats()` function is deprecated; use `TextStats` class instead'
    # the filter change is scoped to this ``with`` block, so the caller's
    # warning configuration is restored afterwards
    with warnings.catch_warnings():
        warnings.simplefilter('once', DeprecationWarning)
        warnings.warn(msg, DeprecationWarning)

    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')

    sent_count = doc.n_sents
    tokens = list(
        extract.words(doc, filter_punct=True, filter_stops=False, filter_nums=False))
    word_count = len(tokens)
    # nothing can be computed for a doc without words; bail out early
    if word_count == 0:
        logging.warning("readability stats can't be computed because doc has 0 words")
        return None

    unique_word_count = len(set(token.lower for token in tokens))
    char_count = sum(len(token) for token in tokens)

    # syllables per word are estimated as (hyphenation positions + 1)
    hyphenator = data.load_hyphenator(lang='en')
    syllable_counts = [
        len(hyphenator.positions(token.lower_)) + 1 for token in tokens]
    syllable_count = sum(syllable_counts)
    # "polysyllable" words have 3 or more syllables
    polysyllable_count = len([n for n in syllable_counts if n >= 3])

    return {
        'n_sents': sent_count,
        'n_words': word_count,
        'n_unique_words': unique_word_count,
        'n_chars': char_count,
        'n_syllables': syllable_count,
        'n_polysyllable_words': polysyllable_count,
        'flesch_kincaid_grade_level': flesch_kincaid_grade_level(
            syllable_count, word_count, sent_count),
        'flesch_readability_ease': flesch_readability_ease(
            syllable_count, word_count, sent_count),
        'smog_index': smog_index(polysyllable_count, sent_count),
        'gunning_fog_index': gunning_fog_index(
            word_count, polysyllable_count, sent_count),
        'coleman_liau_index': coleman_liau_index(
            char_count, word_count, sent_count),
        'automated_readability_index': automated_readability_index(
            char_count, word_count, sent_count),
    }
def readability_stats(doc):
    """
    Get calculated values for a variety of statistics related to the
    "readability" of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease,
    SMOG Index, Gunning-Fog Index, Coleman-Liau Index, and Automated
    Readability Index. Also includes constituent values needed to compute
    the stats, e.g. word count.

    Args:
        doc (:class:`texts.TextDoc() <textacy.texts.TextDoc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value
        (int or float), or None if ``doc`` contains no words (in which case
        a warning is logged)

    Raises:
        NotImplementedError: if ``doc`` is not English language. sorry.
    """
    # imported locally so this function does not depend on a module-level import
    import logging

    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')

    n_sents = doc.n_sents
    words = list(doc.words(filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(words)
    # The readability indices are ratios over the word count (per their
    # standard definitions), so a doc without words cannot be scored. Stop
    # here instead of passing zero counts to the formulas.
    if n_words == 0:
        logging.warning("readability stats can't be computed because doc has 0 words")
        return None

    n_unique_words = len({word.lower for word in words})
    n_chars = sum(len(word) for word in words)

    # syllables per word are estimated as (hyphenation positions + 1)
    hyphenator = data.load_hyphenator(lang='en')
    syllables_per_word = [len(hyphenator.positions(word.lower_)) + 1 for word in words]
    n_syllables = sum(syllables_per_word)
    # "polysyllable" words have 3 or more syllables
    n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3)

    return {
        'n_sents': n_sents,
        'n_words': n_words,
        'n_unique_words': n_unique_words,
        'n_chars': n_chars,
        'n_syllables': n_syllables,
        'n_polysyllable_words': n_polysyllable_words,
        'flesch_kincaid_grade_level': flesch_kincaid_grade_level(n_syllables, n_words, n_sents),
        'flesch_readability_ease': flesch_readability_ease(n_syllables, n_words, n_sents),
        'smog_index': smog_index(n_polysyllable_words, n_sents),
        'gunning_fog_index': gunning_fog_index(n_words, n_polysyllable_words, n_sents),
        'coleman_liau_index': coleman_liau_index(n_chars, n_words, n_sents),
        'automated_readability_index': automated_readability_index(n_chars, n_words, n_sents),
    }
def readability_stats(doc):
    """
    Get calculated values for a variety of statistics related to the
    "readability" of a text: Flesch-Kincaid Grade Level, Flesch Reading Ease,
    SMOG Index, Gunning-Fog Index, Coleman-Liau Index, and Automated
    Readability Index. Also includes constituent values needed to compute
    the stats, e.g. word count.

    Args:
        doc (:class:`texts.TextDoc() <textacy.texts.TextDoc>`)

    Returns:
        dict: mapping of readability statistic name (str) to value
        (int or float); None is returned, and a warning logged, when ``doc``
        has no words to compute statistics from

    Raises:
        NotImplementedError: if ``doc`` is not English language. sorry.
    """
    # function-scope import keeps this block self-contained
    import logging

    if doc.lang != 'en':
        raise NotImplementedError('non-English NLP is not ready yet, sorry')

    n_sents = doc.n_sents
    words = list(
        doc.words(filter_punct=True, filter_stops=False, filter_nums=False))
    n_words = len(words)
    if not words:
        # Every index below is normalized by the word count (per the
        # standard formula definitions), so an empty doc has no meaningful
        # scores. Return before loading the hyphenator or calling the formulas.
        logging.warning(
            "readability stats can't be computed because doc has 0 words")
        return None

    n_unique_words = len({word.lower for word in words})
    n_chars = sum(len(word) for word in words)

    hyphenator = data.load_hyphenator(lang='en')
    # syllables per word are estimated as (hyphenation positions + 1)
    syllables_per_word = [
        len(hyphenator.positions(word.lower_)) + 1 for word in words
    ]
    n_syllables = sum(syllables_per_word)
    # "polysyllable" words have 3 or more syllables
    n_polysyllable_words = sum(1 for n in syllables_per_word if n >= 3)

    return {
        'n_sents': n_sents,
        'n_words': n_words,
        'n_unique_words': n_unique_words,
        'n_chars': n_chars,
        'n_syllables': n_syllables,
        'n_polysyllable_words': n_polysyllable_words,
        'flesch_kincaid_grade_level':
            flesch_kincaid_grade_level(n_syllables, n_words, n_sents),
        'flesch_readability_ease':
            flesch_readability_ease(n_syllables, n_words, n_sents),
        'smog_index':
            smog_index(n_polysyllable_words, n_sents),
        'gunning_fog_index':
            gunning_fog_index(n_words, n_polysyllable_words, n_sents),
        'coleman_liau_index':
            coleman_liau_index(n_chars, n_words, n_sents),
        'automated_readability_index':
            automated_readability_index(n_chars, n_words, n_sents),
    }