Example #1
def most_verbose(data_word):
    """Finds long multi-syllable words and outputs a dataframe with values."""

    verbose_words = []
    synonyms = []

    #looping through words to find complex words and their synonyms
    for word in data_word:

        #finding complex words and recording word & lemma
        if textstat.syllable_count(word) > 3:

            word_syn = wordnet.synsets(word)
            lemmas = list(
                chain.from_iterable([syn.lemma_names() for syn in word_syn]))
            lemmas = [
                lemma for lemma in lemmas
                if textstat.syllable_count(lemma) <= 3
            ]

            verbose_words.append(word)
            synonyms.append(lemmas)

    #creating dataframe with data
    df_verbose = pd.DataFrame({
        'Word': verbose_words,
        'Synonyms': synonyms
    },
                              columns=['Word', 'Synonyms'])

    df_verbose.sort_values('Word', inplace=True)
    df_verbose.drop_duplicates(subset='Word', keep='first', inplace=True)
    return df_verbose
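
A minimal usage sketch for most_verbose, assuming the imports the snippet relies on (textstat, pandas as pd, itertools.chain, nltk.corpus.wordnet) are in place and that data_word is a list of lower-cased word tokens; the sample sentence is made up:

from nltk.tokenize import word_tokenize

sample = "The investigator articulated an extraordinarily complicated hypothesis."
tokens = [t.lower() for t in word_tokenize(sample) if t.isalpha()]
print(most_verbose(tokens))
# one row per word of more than three syllables, paired with shorter WordNet lemmas
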
def count_fry_readability(data_list):
    sentence_numbers = 0
    syllables_numbers = 0
    # sample from the beginning of the text: count sentences and syllables
    # until roughly 150 words have been seen
    words_count = 0
    for sentence in data_list:
        sentence_numbers = sentence_numbers + 1
        syllables_numbers = syllables_numbers + textstat.syllable_count(sentence)
        words_count = words_count + textstat.lexicon_count(sentence)
        if words_count >= 150:
            break

    # repeat the sampling from the end of the text; the two ~150-word samples
    # cover about 300 words, so the totals are divided by 3 below to get
    # per-100-word averages for the Fry graph
    words_count = 0
    for sentence in reversed(data_list):
        sentence_numbers = sentence_numbers + 1
        syllables_numbers = syllables_numbers + textstat.syllable_count(sentence)
        words_count = words_count + textstat.lexicon_count(sentence)
        if words_count >= 150:
            break

    avg_sentence_numbers = round(sentence_numbers / 3)
    avg_syllables_numbers = round(syllables_numbers / 3)

    return get_value_from_fry_graph(avg_sentence_numbers, avg_syllables_numbers)
Example #3
def stat(data, data_word, data_sent):
    """Computes basic overview metrics and returns list of values"""
    #basic counts
    sent = len(data_sent)
    syll = textstat.syllable_count(data)
    word = len(data_word)

    #average calcs
    avg_syll = syll / word
    avg_word = word / sent
    read_time = word / 265

    #advance stat
    flesch_kincaid_grade = fkg(int(word), int(sent), int(syll))
    verbose = len(
        [word for word in data_word if textstat.syllable_count(word) > 3])

    wordy = 0
    for item in data_sent:
        token = word_tokenize(item)
        if len(token) > 40:
            wordy += 1
    #writing to list
    stats = [
        syll, word, sent, avg_syll, avg_word, read_time, flesch_kincaid_grade,
        verbose, wordy
    ]

    return stats
def syllable_table():
    lst_1 = [
        textstat.syllable_count(text) for text in insincere_questions.tolist()
    ]
    lst_2 = [
        textstat.syllable_count(text) for text in sincere_questions.tolist()
    ]
    table = build_table(lst_1, lst_2)
    py.plot(table, filename='syllable_table')
def build_syllable_dct(lst):
    dct = {}
    for t in lst:
        # compute the syllable count once per text instead of three times
        count = textstat.syllable_count(t)
        dct[count] = dct.get(count, 0) + 1
    for key in dct.keys():
        dct[key] = dct[key] / len(lst)
    sorted_tuple = sorted(dct.items(), key=operator.itemgetter(0))
    return sorted_tuple
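
A small usage sketch for build_syllable_dct; the question list is a hypothetical stand-in for the insincere_questions / sincere_questions series used above, and textstat plus operator are assumed to be imported as in the snippet:

questions = ["Why is the sky blue?",
             "What is the airspeed velocity of an unladen swallow?"]
for syllables, share in build_syllable_dct(questions):
    print(syllables, share)  # each syllable count with its share of the list
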
Example #6
def other_features_(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features.

    This is modified to only include those features in the final
    model."""

    sentiment = sentiment_analyzer.polarity_scores(tweet)

    words = preprocess(tweet) #Get text only

    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))

    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)

    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    features = [FKRA, FRE, syllables, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['compound'],
                twitter_objs[2], twitter_objs[1],]
    #features = pandas.DataFrame(features)
    return features
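
For reference, with the coefficients used above and a hypothetical tweet of 12 words averaging 1.5 syllables per word (sentence count fixed to 1), the modified scores work out to FKRA = 0.39*12 + 11.8*1.5 - 15.59 = 6.79 and FRE = 206.835 - 1.015*12 - 84.6*1.5 = 67.755.
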
def textstat_stats(text):
    doc_length = len(text.split()) 
    flesch_ease = ts.flesch_reading_ease(text) #Flesch Reading Ease Score
    flesch_grade = ts.flesch_kincaid_grade(text) #Flesch-Kincaid Grade Level
    gfog = ts.gunning_fog(text) # FOG index, also indicates grade level
#    smog = ts.smog_index(text) # SMOG index, also indicates grade level, only useful on 30+ sentences
    auto_readability = ts.automated_readability_index(text) #approximates the grade level needed to comprehend the text.
    cl_index = ts.coleman_liau_index(text) #grade level of the text using the Coleman-Liau Formula.
    lw_formula = ts.linsear_write_formula(text) #grade level using the Linsear Write Formula.
    dcr_score = ts.dale_chall_readability_score(text) #uses a lookup table of the most commonly used 3000 English words
#    text_standard = ts.text_standard(text, float_output=False) # summary of all the grade level functions
    syll_count = ts.syllable_count(text, lang='en_US')
    syll_count_scaled = syll_count / doc_length
    lex_count = ts.lexicon_count(text, removepunct=True)
    lex_count_scaled = lex_count / doc_length
    idx = ['flesch_ease', 'flesch_grade','gfog',
           'auto_readability','cl_index','lw_formula',
           'dcr_score', 
#           'text_standard', 
           'syll_count', 'lex_count']
    return pd.Series([flesch_ease, flesch_grade, gfog, 
                      auto_readability, cl_index, lw_formula, 
                      dcr_score, 
#                      text_standard, 
                      syll_count_scaled, lex_count_scaled], index = idx)
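
A minimal sketch of how textstat_stats is typically applied across a corpus (the texts are hypothetical; ts is assumed to be an alias for textstat and pd for pandas, as in the snippet):

docs = pd.Series(["Short sample text.",
                  "A considerably more elaborate and demanding sample sentence."])
readability = docs.apply(textstat_stats)  # one row of readability metrics per document
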
def feature_getter(text):
    try:
        text=text.decode('utf-8')
    except:
        pass
    text1=re.sub(r'[^\x00-\x7F]+',' ', text)
    ##text1=re.sub('\n','. ', text)
    text=text1
    features=[]
    tokens=[]
    sentences = nltk.sent_tokenize(text)
    for sentence in sentences:
        tokens.extend(nltk.word_tokenize(sentence))
    
    syllable_count = textstat.syllable_count(text, lang='en_US')
    word_count = textstat.lexicon_count(text, removepunct=True)

    flesch = textstat.flesch_reading_ease(text)
    readability = textstat.automated_readability_index(text)

    features.append(len(sentences)) #num_sentences
    features.append(syllable_count) #num_syllables
    features.append(word_count) #num_words
    features.append(flesch) #flesch_reading_ease
    features.append(readability) #automated_readability_index
    return features
Example #9
def fkg_over_text(data_sent):
    """Returns two lists of x and y points for an fkg graph"""
    if len(data_sent) >= 200:
        step = 40
    else:
        step = max(1, len(data_sent) // 10)

    y = []
    temp_fkg = []

    for count, sent in enumerate(data_sent, 1):

        temp_fkg.append(sent)

        if count >= step:

            words = [
                word for sent in temp_fkg for word in nltk.word_tokenize(sent)
            ]
            words = [word.lower() for word in words if word.isalpha()]

            word = len(words)

            syll = sum([textstat.syllable_count(word) for word in words])

            y.append(fkg(word, step, syll))
            temp_fkg = temp_fkg[1:]

    x = range(step, len(y) + step)
    return x, y
Example #10
def most_wordy(data_sent):
    """Finds long sentences and outputs a dataframe with values."""
    #initialize lists
    sylls = []
    words = []
    sents = []
    fkgs = []

    #looping through sentences to find lengthy sentences
    for sent in data_sent:
        token = word_tokenize(sent)
        word = len(token)
        if word > 40:

            #appending to lists
            syll = textstat.syllable_count(sent)
            sylls.append(syll)
            words.append(word)
            sents.append(sent)
            fkgs.append(fkg(int(word), 1, int(syll)))

    #transfer information to dataframe
    df_wordy = pd.DataFrame(
        {
            'Words': words,
            'Syllables': sylls,
            'Flesch Kincaid Grade Level': fkgs,
            'Sentence': sents
        },
        columns=[
            "Words", "Syllables", "Flesch Kincaid Grade Level", "Sentence"
        ])
    df_wordy.sort_values("Words", ascending=False, inplace=True)
    return df_wordy
def flesch_kincaid(row):
    text = row['reviewText']
    words = max(1, textstat.lexicon_count(text))
    sentences = max(1, sentence_count(row))
    syllables = textstat.syllable_count(text, lang='en_US')
    # Note: despite the function name, this is the Flesch Reading Ease
    # formula, not the Flesch-Kincaid grade-level formula.
    score = 206.835 - 1.015 * (float(words) /
                               sentences) - 84.6 * (float(syllables) / words)
    return score
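
As a quick sanity check of the formula above: a review of 100 words split across 5 sentences with 150 syllables scores 206.835 - 1.015*(100/5) - 84.6*(150/100) = 206.835 - 20.3 - 126.9 = 59.635.
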
Example #12
def syllableCount(word):
    # Check the Moby project first because robots are bad at english.
    # If it's not in the dictionary, ask Zoltar.
    if (masterSyllables.get(word)):
        return masterSyllables.get(word)
    elif (masterSyllables.get(word.lower())):
        return masterSyllables.get(word.lower())
    else:
        return textstat.syllable_count(word)
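
masterSyllables is not defined in this snippet; presumably it is a dict mapping words to syllable counts built from the Moby hyphenation list. A hypothetical stand-in that exercises both branches:

masterSyllables = {"beautiful": 3, "created": 3}  # hypothetical entries
print(syllableCount("beautiful"))  # 3, served from the lookup
print(syllableCount("xylophone"))  # not in the lookup, falls back to textstat.syllable_count
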
Example #13
def get_raw_stats(book, text):
    return {
        'total_words': textstat.lexicon_count(text),
        'total_sentences': len(sent_tokenize(text)),
        'total_letters': textstat.letter_count(text),
        'total_syllables': textstat.syllable_count(text),
        # 'total_paragraphs': len(get_paragraphs(text)),
        # 'average_word_difficulty': get_average_frequency(book)
    }
def do_datas():
    # logging.info('do_datas')

    ########### Save text statistics
    ##### 1. nw 2. nvocab 3. nsyllable 4.nsentence 5. tone 6. readability
    ## 1. nw
    nw.append(len(words))
    ## 2. nvocab
    nvocab.append(len(vocab))
    ## 3. syllable
    n = textstat.syllable_count(contents)
    nsyllable.append(n)
    ## 4. sentence
    n = textstat.sentence_count(contents)
    nsentence.append(n)
    ## 5. tone
    ### LM dictionary
    n_neg_lm.append(count_occurrence(words, lm_neg))
    n_pos_lm.append(count_occurrence(words, lm_pos))
    n_uctt_lm.append(count_occurrence(words, lm_uctt))
    n_lit_lm.append(count_occurrence(words, lm_lit))
    n_cstr_lm.append(count_occurrence(words, lm_cstr))
    n_modal1_lm.append(count_occurrence(words, lm_modal1))
    n_modal2_lm.append(count_occurrence(words, lm_modal2))
    n_modal3_lm.append(count_occurrence(words, lm_modal3))
    n_negation_lm.append(count_negation(words, lm_pos, gt_negation))
    ### General Inquirer dictionary
    n_neg_gi.append(count_occurrence(words, gi_neg))
    n_pos_gi.append(count_occurrence(words, gi_pos))
    n_negation_gi.append(count_negation(words, gi_pos, gt_negation))
    ### Henry dictionary
    n_neg_hr.append(count_occurrence(words, hr_neg))
    n_pos_hr.append(count_occurrence(words, hr_pos))
    n_negation_hr.append(count_negation(words, gi_pos, gt_negation))
    ## 6. readability
    fre_i = textstat.flesch_reading_ease(contents)
    if fre_i > 100:
        fre_i = 100
    if fre_i < 0:
        fre_i = float('NaN')
    fre.append(fre_i)
    fkg_i = textstat.flesch_kincaid_grade(contents)
    if fkg_i < 0:
        fkg_i = float('NaN')
    fkg.append(fkg_i)
    # Coleman-Liau index
    cl_i = textstat.coleman_liau_index(contents)
    if cl_i < 0:
        cl_i = float('NaN')
    cl.append(cl_i)
    f = textstat.gunning_fog(contents)
    fog.append(f)
    f = textstat.automated_readability_index(contents)
    ari.append(f)
    f = textstat.smog_index(contents)
    smog.append(f)
Example #15
def compute_readability_stats(text):
    """
    Compute reading statistics of the given text
    Reference: https://github.com/shivam5992/textstat

    Parameters
    ==========
    text: str, input section or abstract text
    """
    try:
        readability_dict = {
            'flesch_reading_ease': textstat.flesch_reading_ease(text),
            'smog': textstat.smog_index(text),
            'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
            'coleman_liau_index': textstat.coleman_liau_index(text),
            'automated_readability_index': textstat.automated_readability_index(text),
            'dale_chall': textstat.dale_chall_readability_score(text),
            'difficult_words': textstat.difficult_words(text),
            'linsear_write': textstat.linsear_write_formula(text),
            'gunning_fog': textstat.gunning_fog(text),
            'text_standard': textstat.text_standard(text),
            'n_syllable': textstat.syllable_count(text),
            'avg_letter_per_word': textstat.avg_letter_per_word(text),
            'avg_sentence_length': textstat.avg_sentence_length(text)
        }
    except Exception:
        readability_dict = {
            'flesch_reading_ease': None,
            'smog': None,
            'flesch_kincaid_grade': None,
            'coleman_liau_index': None,
            'automated_readability_index': None,
            'dale_chall': None,
            'difficult_words': None,
            'linsear_write': None,
            'gunning_fog': None,
            'text_standard': None,
            'n_syllable': None,
            'avg_letter_per_word': None,
            'avg_sentence_length': None
        }
    return readability_dict
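
A short usage sketch (the sample text is arbitrary):

stats = compute_readability_stats(
    "Playing games has always been thought to be important to "
    "the development of well-balanced and creative children.")
print(stats['flesch_reading_ease'], stats['n_syllable'])
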
Example #16
    def _add_lemma(self, lm):
        tlid = self.lemma_name.setdefault(lm, self.lid)
    #     sid = max(self.lid, tlid+1)
        if tlid == self.lid:
            self.lemma.append((tlid, lm, textstat.syllable_count(lm)))  #, False))  - isCommon
            self.lid += 1

            # if lemma not already visited, visit all of its synset
            for ss in wn.synsets(lm):
                tssid = self._add_synset(ss)
                self.means.add((tlid, tssid))
        return tlid
def sentence_fit(gen_text, orig_text):
    df = pd.DataFrame(
        gen_text,
        columns=['generated'])  # Text generated from GPT2 stored in dataframe
    df['generated'] = df['generated'].str.replace(
        r' +,', ',', regex=True).str.replace(
            r' +\.', '.', regex=True)  # Remove spaces in front of punctuation
    df['similarity'] = df['generated'].apply(lambda x: text_similarity(
        orig_text, x))  # Assess cosine similarity betweeen sentences
    df['n_syll'] = df['generated'].apply(
        textstat.syllable_count)  # Count number of syllables
    df['n_lex'] = df['generated'].apply(
        textstat.lexicon_count)  # Count number of words
    df['syll_lex'] = df['n_syll'] / df['n_lex']  # Syllable to word ratio

    # Flags to indicate whether generated text has fewer words, syllables, or a lower syllable-to-word ratio
    df['rel_syll'] = np.where(
        df['n_syll'] < textstat.syllable_count(orig_text), 1, 0)
    df['rel_lex'] = np.where(df['n_lex'] < textstat.lexicon_count(orig_text),
                             1, 0)
    df['rel_rat'] = np.where(
        df['syll_lex'] <
        textstat.syllable_count(orig_text) / textstat.lexicon_count(orig_text),
        1, 0)

    # Sum binary indicators of relative sentence simplicity
    df['rel_simp'] = (df['rel_syll'] + df['rel_lex'] + df['rel_rat']) / 3

    # Fit score is weighted sum of similarity and relative sentence simplicity
    # Highest score will be chosen
    df['fit_score'] = 0.7 * df['similarity'] + 0.3 * df['rel_simp']

    # Subset data and rename columns
    df['Original'] = orig_text
    df = df[['Original', 'generated', 'similarity', 'rel_simp', 'fit_score']]
    df.columns = [
        'Original', 'Generated', 'Similarity', 'Simplicity', 'Fit Score'
    ]

    return df
def getSyllableCount(word):
    """
        A function to count syllables
        :param word: word whose syllables will be counted
        :return: number of syllables in the word
    """
    words = word.split(" ")
    count = 0

    for eachWord in words:
        count += textstat.syllable_count(eachWord)

    return count
def count_sentences_syllables(data_list):
    sentences_30_syllables = []
    sentences_20_syllables = []
    sentences_30_count = 0
    sentences_20_count = 0

    for sentence in data_list:
        count = sum([textstat.syllable_count(word) for word in sentence.split()])
        if count > 30:
            sentences_30_count += 1
            sentences_30_syllables.append((sentences_30_count, sentence))
        if 20 < count < 30:
            sentences_20_count += 1
            sentences_20_syllables.append((sentences_20_count, sentence))
    return sentences_30_syllables, sentences_30_count, sentences_20_syllables, sentences_20_count
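
A minimal usage sketch for count_sentences_syllables; the sentences are hypothetical and which bucket each lands in depends on textstat's syllable estimates:

sentences = ["Short one.",
             "An appreciably longer sentence, containing considerably more syllables than the first."]
over_30, n_30, over_20, n_20 = count_sentences_syllables(sentences)
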
Example #20
def isValuableComment(clean_text):
    if clean_text != clean_text:  # NaN check: NaN is never equal to itself
        return False
    print("clean text:", clean_text)
    if (len(clean_text) <= 10):
        return False

    readability_score = textstat.flesch_reading_ease(clean_text)
    syllable_count = textstat.syllable_count(clean_text)

    # Score of 80-90 == easy (from 100 (very easy to hard))
    # if (readability_score > 85):
    #     return False
    # print(readability_score)

    return True
def lisibilty(text):

    f_lis = ([
        textstat.syllable_count(str(text), lang='en_arabic'),
        textstat.lexicon_count(str(text), removepunct=True),
        textstat.sentence_count(str(text)),
        textstat.flesch_reading_ease(str(text)),
        textstat.flesch_kincaid_grade(str(text)),
        textstat.gunning_fog(str(text)),
        textstat.smog_index(str(text)),
        textstat.automated_readability_index(str(text)),
        textstat.coleman_liau_index(str(text)),
        textstat.linsear_write_formula(str(text)),
        textstat.dale_chall_readability_score(str(text))
    ])
    return f_lis
Example #22
def get_word_stats(word):
    global sentiment_analyzer
    if sentiment_analyzer is None:
        sentiment_analyzer = SentimentIntensityAnalyzer()

    count_syllables = textstat.syllable_count(word)
    freq_score = zipf_frequency(word, "en")
    polarity = sentiment_analyzer.polarity_scores(word)
    stats = {
        "syllables": count_syllables,
        "freq_score": freq_score,
        "sentiment": 1 if polarity["pos"] else -1 if polarity["neg"] else 0,
        "sentiment_degree": polarity["compound"],
        "difficulty": (min(count_syllables, 6) * 5 // (1 + min(freq_score, 6)))
    }
    return stats
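
For example, under the heuristic above a word with 3 syllables and a Zipf frequency of 4.0 gets difficulty min(3, 6) * 5 // (1 + min(4.0, 6)) = 15 // 5.0 = 3.0.
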
Example #23
def syll_over_text(data_word):
    """Returns two lists of x and y points for a syllable per word graph"""

    step = 200
    y = []
    temp_syll = []

    for count, word in enumerate(data_word, 1):

        temp_syll.append(textstat.syllable_count(word))

        if count >= step:
            y.append(sum(temp_syll) / len(temp_syll))
            temp_syll = temp_syll[1:]

    x = range(step, len(y) + step)
    return x, y
def words_sentence_syllables(data_list):
    words_12_letters = []
    words_4_syllables = []
    words_12_count = 0
    words_4_count = 0

    for sentence in data_list:
        for word in sentence.split():
            count4 = textstat.syllable_count(word)
            count12 = textstat.letter_count(word)
            if count12 > 12:
                words_12_count += 1
                words_12_letters.append((count12, word))
            if count4 > 4:
                words_4_count += 1
                words_4_syllables.append((count4, word))
    return words_12_letters, words_12_count, words_4_syllables, words_4_count
Example #25
def forcast(doc):
    """
    :param doc: doc object exposing a word_tokens list
    :returns: tuple of (grade level, reading age, monosyllable count)
    """
    # drop single-character non-alphabetic tokens (punctuation) without
    # mutating the list while iterating over it, which would skip elements
    word_tokens = [t for t in doc.word_tokens
                   if t.isalpha() or len(t) >= 2]
    monosyllables = 0

    for i in word_tokens[10:159]:
        if syllable_count(i) < 2:
            monosyllables += 1

    # FORCAST formula over a ~150-word sample:
    # grade level = 20 - monosyllables / 10, reading age = grade level + 5
    gl = 20 - (monosyllables/10)
    ra = 25 - (monosyllables/10)
    return (gl, ra, monosyllables)
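
Worked example of the FORCAST-style estimate above: if 120 of the sampled tokens are monosyllabic, the grade level is 20 - 120/10 = 8.0 and the reading age is 25 - 120/10 = 13.0.
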
Example #26
    def get_readability_features(self):
        sent_tokens = text_tokenizer(self.raw_text,
                                     replace_url_flag=True,
                                     tokenize_sent_flag=True)
        sentences = [' '.join(sent) + '\n' for sent in sent_tokens]
        sentences = ''.join(sentences)
        self.syllable_count = textstat.syllable_count(sentences)
        self.flesch_reading_ease = textstat.flesch_reading_ease(sentences)
        self.flesch_kincaid_grade = textstat.flesch_kincaid_grade(sentences)
        self.fog_scale = textstat.gunning_fog(sentences)
        self.smog = textstat.smog_index(sentences)
        self.automated_readability = textstat.automated_readability_index(
            sentences)
        self.coleman_liau = textstat.coleman_liau_index(sentences)
        self.linsear_write = textstat.linsear_write_formula(sentences)
        self.dale_chall_readability = textstat.dale_chall_readability_score(
            sentences)
        self.text_standard = textstat.text_standard(sentences)
def get_desc_data(string):
    '''
    Input: book description string
    Output: returns desc_semantic, word_cnt, description_len, number_unique_words,
            average_word_len, syl_count, lex_count, sent_count, flesch
    '''
    #Data before text processing
    desc_semantic = get_semantic(string)
    syl_count = syllable_count(string)
    lex_count = lexicon_count(string)
    sent_count = sentence_count(string)
    flesch = flesch_reading_ease(string)

    #Data after text processing
    string = text_preprocess(string)
    word_cnt = word_count(string)
    description_len = desc_len(string)
    number_unique_words = num_unique_words(string)
    average_word_len = avg_word_len(string)
    return desc_semantic, word_cnt, description_len, number_unique_words, \
           average_word_len, syl_count, lex_count, sent_count, flesch
Example #28
    def test_syllable_count(self):
        count = textstat.syllable_count(self.long_test)

        self.assertEqual(521, count)
Example #29
def test_syllable_count():
    count = textstat.syllable_count(long_test)

    assert count == 521
Example #30
text = """
PDM Bugs Workflow

If a ticket's Summary, Description, Steps to Reproduce or Expected Result is not clear enough for you, use the Feedback status and assign it back to the previous assignee. Be sure to leave a comment describing what you require to proceed.
If you can't proceed with the resolution of the ticket because of any other limitations, set it to Blocked status and give your reasons for doing so in the Comments. Also, remember to change Assignee to whoever you think is responsible for unblocking the issue, or to the Project Manager if you don't know who that might be.

If you decide to do so, you will be able to track the time tickets spend in the Feedback and Blocked statuses here. You can set a custom timespan, desired ticket status, and project to see how you're progressing.


Company Glossary - a concept, TBD with Docs Team

Ever had the impression during a conversation that both of you are talking about the same thing but you name it differently? That's common in growing businesses like our company. For example, the definition of a device varies, or what's a view to someone is a section to someone else. Or my favourite: some people even refer to bugs as features.

The list is very long. Many times the differences stem from the Project customization and it's not possible to have alerts when the client requested faults. But some of them can be unified. Enter the Docs Team! Our Technical Writers are working on a glossary of business, technical and other terms used in our company so that we make sure we're all on the same page in terms of vocabulary.

"""

print(textstat.syllable_count(text))
print(
    f"The Flesch Reading Ease score is: {textstat.flesch_reading_ease(text)}")
print(
    f"The Flesch-Kincaid Grade level is: {textstat.flesch_kincaid_grade(text)}"
)
print(
    f"The Dale-Chall Readability Score is: {textstat.dale_chall_readability_score(text)}"
)
print(
    f"The readability consensus is: {textstat.text_standard(text, float_output=False)}"
)
Example #31
def test_syllable_count():
    textstat.set_lang("en_US")
    count = textstat.syllable_count(long_test)

    assert count == 521