Example #1
0
def analyzeSubLevel(input_text: str):
    """
    Analyze the language level of a text.

    :param input_text: raw text to analyze; must be a non-empty str
    :Returns: tuple (highest CEFR level of any word in input_text,
              flesh_kincade_level, number of words) — or the empty
              string "" when input_text is invalid.
              NOTE(review): inconsistent return types ("" vs 3-tuple);
              callers must handle both shapes.
    """

    # Guard clause: reject non-string or empty input.
    if (not (isinstance(input_text, str)) or (len(input_text) <= 0)):
        return ""

    # Lazily load the CEFR word-level data on first call.
    # NOTE(review): `this` is not standard Python — presumably an alias
    # for `self` or a module-level namespace object; confirm in context.
    if (this.cefr_data is None):
        this.cefr_data = loadCefrList()

    # TBD make static
    # Lazily load the spaCy English model on first call.
    if (this.spacy_en is None):
        this.spacy_en = loadSpacyLangEn()

    # normalize text with NLP
    input_text = processText(input_text)

    doc = textacy.make_spacy_doc(input_text, lang=this.spacy_en)
    ts = textacy.TextStats(doc)

    # Flesch-Kincaid grade from word count and mean syllables per word.
    # NOTE(review): ts.n_words == 0 would raise ZeroDivisionError here;
    # presumably processText() never yields a wordless document — verify.
    flesh_kincade_level = calcFleshKincadeGrade(ts.n_words,
                                                ts.n_syllables / ts.n_words)

    # store words of text lowercase in list
    words: list = [item.lower() for item in input_text.split()]
    max_level = getMaxWordLevelForWordsSet(set(words), this.cefr_data)

    return max_level, flesh_kincade_level, ts.n_words
Example #2
0
def textacy_featurize(transcript):
    """
    Extract textacy-based features from a transcript.

    :param transcript: text string to featurize
    :return: (features, labels) — parallel lists of numeric feature
             values and the corresponding feature names
    """
    features = []
    labels = []

    # Build a spaCy doc; if the English model is missing, download it
    # once and retry. Was a bare `except:`, which also swallowed
    # KeyboardInterrupt/SystemExit — narrowed to Exception.
    try:
        doc = textacy.make_spacy_doc(transcript)
    except Exception:
        # NOTE(review): shelling out to download a model at runtime is
        # fragile; consider failing fast with a clear error instead.
        os.system('python3 -m spacy download en')
        doc = textacy.make_spacy_doc(transcript)

    ts = textacy.TextStats(doc)

    features.append(ts.n_unique_words)
    labels.append('uniquewords')

    # Basic counts dict (n_chars, n_words, n_sents, ...): values become
    # features, keys become labels, in matching order.
    mfeatures = ts.basic_counts
    features += list(mfeatures.values())
    labels += list(mfeatures)

    features.append(ts.flesch_kincaid_grade_level)
    labels.append('flesch_kincaid_grade_level')

    # Full readability-statistics bundle, same values/keys pairing.
    readability = ts.readability_stats
    features += list(readability.values())
    labels += list(readability)

    return features, labels
def calculate_text_stats(books, overwrite):
    """
    Calculate Textacy text stats for every book and persist them as CSV.

    :param books: list of Book objects
    :param overwrite: when True, regenerate the CSV even if it already exists
    """
    output_file = constants.PROCESSED_DATA_DIR / constants.TEXT_STATS_CSV_FILENAME

    # Nothing to do when the CSV is already present and we must not overwrite.
    if os.path.exists(output_file) and not overwrite:
        return

    nlp = spacy.load(constants.MODEL_DIR, disable=["tagger", "ner", "tokenizer", "textcat"])

    rows = []
    for book in books:
        LOGGER.info('Calculate TextStats %s', book.title)
        content = book.content()
        # Raise the spaCy length limit so long books are not rejected.
        nlp.max_length = len(content)
        stats = textacy.TextStats(nlp(content))
        row = {'book': book.title}
        row.update(stats.readability_stats)
        row.update(stats.basic_counts)
        rows.append(row)

    pd.DataFrame(rows).to_csv(output_file, index=False, encoding='utf-8')
Example #4
0
def ts_es():
    """Return a TextStats fixture built from a Spanish sample paragraph."""
    # Opening lines of "Cien años de soledad" (García Márquez).
    sample = (
        "Muchos años después, frente al pelotón de fusilamiento, el coronel Aureliano "
        "Buendía había de recordar aquella tarde remota en que su padre lo llevó a "
        "conocer el hielo. Macondo era entonces una aldea de veinte casas de barro y "
        "cañabrava construidas a la orilla de un río de aguas diáfanas que se precipitaban "
        "por un lecho de piedras pulidas, blancas y enormes como huevos prehistóricos. "
        "El mundo era tan reciente, que muchas cosas carecían de nombre, y para "
        "mencionarlas había que señalarlas con el dedo.")
    doc = textacy.make_spacy_doc(sample, lang="es")
    return textacy.TextStats(doc)
Example #5
0
def ts_en():
    """Return a TextStats fixture built from an English sample paragraph."""
    # Opening lines of "One Hundred Years of Solitude" (García Márquez).
    sample = (
        "Many years later, as he faced the firing squad, Colonel Aureliano Buendía was "
        "to remember that distant afternoon when his father took him to discover ice. "
        "At that time Macondo was a village of twenty adobe houses, built on the bank "
        "of a river of clear water that ran along a bed of polished stones, which were "
        "white and enormous, like prehistoric eggs. The world was so recent that many "
        "things lacked names, and in order to indicate them it was necessary to point."
    )
    doc = textacy.make_spacy_doc(sample, lang="en")
    return textacy.TextStats(doc)
Example #6
0
    def wordTuples(graph, textEntry):
        """Extract TextRank key terms for a text entry of the graph.

        Returns a list of single-entry dicts, each mapping a key term
        (lowercased) to its TextRank score.
        """
        source_text = rootify(graph, textEntry)
        lang_pt = textacy.load_spacy('pt')
        document = textacy.Doc(source_text, lang=lang_pt)
        stats = textacy.TextStats(document)

        # Request one ranked term per unique word in the document.
        ranked = textacy.keyterms.textrank(
            document, normalize='lower', n_keyterms=stats.n_unique_words)

        terms = []
        for term, score in ranked:
            terms.append({term: score})
        return terms
Example #7
0
    def get_readability_score_per_doc(self):
        """
        Calculates the flesch reading ease score (different values for german & english!) for each document:
        English formula:
            https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
        German formula:
            https://de.wikipedia.org/wiki/Lesbarkeitsindex#Flesch-Reading-Ease

        Returns
        -------
        List[float]
            all scores for each document
        """
        scores = []
        for document in self.corpus:
            scores.append(textacy.TextStats(document).flesch_reading_ease)
        return scores
Example #8
0
    def complexity(self):
        """
        Flesch reading-ease score of the document, from 0.0 (hardest
        to read) to 100.0 (easiest).

        >>> doc = Doc('Test sentence for testing text')
        >>> doc.complexity
        83.32000000000004
        """
        # Compute and cache the TextStats object lazily.
        if not self._text_stats:
            self._text_stats = textacy.TextStats(self._spacy_doc)
        stats = self._text_stats
        # A syllable-free document would break the formula; report it
        # as maximally readable instead.
        return 100 if stats.n_syllables == 0 else stats.flesch_reading_ease
Example #9
0
    def test_text_stats_functionality(self):
        """Smoke-test TextStats: scalar metrics and the two stat dicts
        expose values of the expected types."""
        stats = textacy.TextStats(self.doc)

        self.assertIsInstance(stats.n_words, int)
        self.assertIsInstance(stats.flesch_kincaid_grade_level, float)

        counts = stats.basic_counts
        self.assertIsInstance(counts, dict)
        int_fields = ('n_chars', 'n_words', 'n_sents')
        for name in int_fields:
            self.assertIsInstance(counts.get(name), int)

        readability = stats.readability_stats
        self.assertIsInstance(readability, dict)
        float_fields = ('flesch_kincaid_grade_level',
                        'automated_readability_index',
                        'wiener_sachtextformel')
        for name in float_fields:
            self.assertIsInstance(readability.get(name), float)
nostril and the eye in a large "pit" on each side of the head. Other 
infrared-sensitive snakes have multiple, smaller labial pits lining the 
upper lip, just below the nostrils.

Snakes use smell to track their prey. They smell by using their forked 
tongues to collect airborne particles, then passing them to the vomeronasal organ 
or Jacobson's organ in the mouth for examination. The fork in the tongue gives 
snakes a sort of directional sense of smell and taste simultaneously. 
They keep their tongues constantly in motion, sampling particles from the air, 
ground, and water, analyzing the chemicals found, and determining the presence 
of prey or predators in the local environment. In water-dwelling snakes, such 
as the anaconda, the tongue functions efficiently underwater.""")

# Show the raw input text.
print('--------------------------------------------------')
print('TEXT')
print('--------------------------------------------------')
print(text)

# Parse the text with textacy's (legacy) Doc wrapper.
doc = textacy.Doc(text)
print('--------------------------------------------------')
print('KEY TERMS')
print('--------------------------------------------------')
# SGRank key terms: 1-4 gram candidates, lowercased; n_keyterms=0.1
# keeps the top 10% of terms.
pprint.pprint(
    textacy.keyterms.sgrank(
        doc, ngrams=(1, 2, 3, 4), normalize='lower', n_keyterms=0.1))

# Readability statistics for the same document.
ts = textacy.TextStats(doc)
print('--------------------------------------------------')
print('READABILITY')
print('--------------------------------------------------')
pprint.pprint(ts.readability_stats)
Example #11
0
def get_passage_scores(passage):
    """Score a passage for readability and sentiment.

    Returns (polarity, subjectivity, readability) where readability is
    the Flesch reading-ease score (legacy textacy attribute name).
    """
    document = textacy.Doc(passage, lang=u"en_core_web_sm")
    stats = textacy.TextStats(document)
    ease = stats.flesch_readability_ease
    sentiment = textblob.TextBlob(passage).sentiment
    return sentiment.polarity, sentiment.subjectivity, ease
Example #12
0
 def get_readability(self):
     """Compute the Wiener Sachtextformel readability index for the
     article and store it on ``self.readability``.

     See https://de.wikipedia.org/wiki/Lesbarkeitsindex#Wiener_Sachtextformel
     """
     self.readability = textacy.TextStats(self.article_model).wiener_sachtextformel