def readability_analysis(self, text):
    words = text.split()
    wrd_dic = {}
    for wrd in words:
        wrd = "".join(a for a in wrd if a not in punctuation)
        wrd_dic[wrd] = textstat.syllable_count(wrd)
    # keep only words with five or more syllables (rebinds wrd_dic to a list)
    wrd_dic = [b for b in wrd_dic if wrd_dic[b] >= 5]

    # clamp the Flesch reading ease score to its nominal 0-100 range
    flesch_reading_ease = textstat.flesch_reading_ease(text)
    if flesch_reading_ease > 100:
        flesch_reading_ease = 100
    elif flesch_reading_ease < 0:
        flesch_reading_ease = 0

    syllable_count = textstat.syllable_count(text)
    avg_syllables_per_word = textstat.avg_syllables_per_word(text)
    avg_letter_per_word = textstat.avg_letter_per_word(text)
    readability = {
        "flesch_reading_ease": flesch_reading_ease,
        "avg_syllables_per_word": avg_syllables_per_word,
        "syllable_count": syllable_count,
        "avg_letter_per_word": avg_letter_per_word,
    }
    grade, score = self.readability_grade(readability)
    readability['grade'] = grade
    readability['score'] = score
    readability['difficult_words'] = wrd_dic
    return readability

def synonym_replacement(input_string, copy_string):
    # accumulators (defined here so the function is self-contained)
    words_with_synonyms = []
    the_synonyms = []

    # loop through each token (word, comma, period) in the input file
    for word in word_tokenize(input_string):
        # find a set of synonyms for each word
        synonym_set = wordnet.synsets(word)
        # if there is a synonym for that word
        if synonym_set:
            # get the synonym with the lowest syllable count
            synonym = find_lowest_syl_count(synonym_set)
            # (alternative: take just the first synonym)
            # synonym = synonym_set[0].lemmas()[0].name()

            # if the synonym has fewer syllables than the word
            if textstat.syllable_count(synonym) < textstat.syllable_count(word):
                words_with_synonyms.append(word)
                # view synonym changes
                # print(word, "-->", synonym, "\n")
                the_synonyms.append(synonym)

    # replace all the words with their corresponding synonyms
    i = 0
    while i < len(words_with_synonyms):
        copy_string = [
            w.replace(words_with_synonyms[i], the_synonyms[i])
            for w in copy_string
        ]
        i += 1

    # join the list into one string
    FINAL_STRING = " ".join(copy_string)
    return FINAL_STRING

def generatehaiku(url):
    # base64.b64encode needs bytes in Python 3, hence the encode/decode
    authheader = "Basic " + base64.b64encode(
        (os.environ['IMAGGA_API_KEY'] + ":" +
         os.environ['IMAGGA_API_SECRET']).encode()).decode()
    headers = {'accept': "application/json", 'authorization': authheader}
    imaggaurl = "http://api.imagga.com/v1/tagging?url={}".format(url)
    r = requests.get(imaggaurl, headers=headers)
    imgtags = r.json()['results'][0]['tags']

    tags = []
    for tag in imgtags:
        tags.append([tag['tag'], int(textstat.syllable_count(tag['tag']))])

    # group tags by syllable count
    tagsbysyllable = {}
    for tag in tags:
        key = tag[1]
        value = tag[0]
        if key not in tagsbysyllable:
            tagsbysyllable[key] = list()
        tagsbysyllable[key].append(value)

    random.seed(url)
    haikuline1 = nsyllables(5, tagsbysyllable)
    haikuline2 = nsyllables(7, haikuline1[1])
    haikuline3 = nsyllables(5, haikuline2[1])
    return render_template('haiku.html', url=url,
                           haikuline1=haikuline1[0],
                           haikuline2=haikuline2[0],
                           haikuline3=haikuline3[0])

def text_analytics(text):
    if textstat.sentence_count(text) != 0:
        lexicon = textstat.lexicon_count(text)  # word count
        sent = textstat.sentence_count(text)    # sentence count
        syll = textstat.syllable_count(text)    # syllable count
        flesch = textstat.flesch_reading_ease(text)         # Flesch score
        smog = textstat.smog_index(text)                    # SMOG index
        fog = textstat.gunning_fog(text)                    # FOG index
        dale = textstat.dale_chall_readability_score(text)  # grade level
        ari = textstat.automated_readability_index(text)    # grade level
        cl = textstat.coleman_liau_index(text)              # grade level

        # interaction terms: each readability score scaled by each raw count,
        # in the original order (flesch1..flesch3, smog1..smog3, ..., cl1..cl3)
        counts = [lexicon, sent, syll]
        scores = [flesch, smog, fog, dale, ari, cl]
        interactions = [score * count for score in scores for count in counts]

        x = counts + scores + interactions
        return x

def get_special_metrics(text):
    blob = TextBlob(text)
    main = {
        "statistics": {
            "syllables": textstat.syllable_count(text),
            "words": textstat.lexicon_count(text),
            "characters": textstat.char_count(text),
            "polysyllables": textstat.polysyllabcount(text),
            "average letter per word": textstat.avg_letter_per_word(text),
            "average sentence length": textstat.avg_sentence_length(text),
            "average sentence per word": textstat.avg_sentence_per_word(text),
            "sentences": textstat.sentence_count(text),
        },
        "difficulty": {
            "flesch reading ease": textstat.flesch_reading_ease(text),
            "smog index": textstat.smog_index(text),
            "flesch kincaid grade": textstat.flesch_kincaid_grade(text),
            "coleman liau index": textstat.coleman_liau_index(text),
            # 'automated readability index': textstat.automated_readability_index(text),
            # 'dale chall readability score': textstat.dale_chall_readability_score(text),
            # 'difficult words': textstat.difficult_words(text),
            # 'linsear write formula': textstat.linsear_write_formula(text),
            "gunning fog": textstat.gunning_fog(text),
        },
        "sentiments": {
            "polarity": blob.sentiment.polarity,
            "subjectivity": blob.sentiment.subjectivity,
        },
    }
    return main

def calculate_statistics(lyrics):
    """
    Calculates statistics based on the text_raw of the lyrics.

    :return: Annotated lyrics containing information about the songs
    """
    logging.info("Calculating Statistics")
    from textstat.textstat import textstat
    for idx, song in tqdm(enumerate(lyrics), total=len(lyrics)):
        try:
            song["num_syllables"] = textstat.syllable_count(song["text_raw"])
            song["num_words"] = textstat.lexicon_count(song["text_raw"])
            song["num_sentences"] = textstat.sentence_count(song["text_raw"])
            song["flesch_score"] = textstat.flesch_reading_ease(song["text_raw"])
            song["flesch_kincaid_level"] = textstat.flesch_kincaid_grade(song["text_raw"])
            song["fog_score"] = textstat.gunning_fog(song["text_raw"])
            # a count of difficult words, not the Dale-Chall score
            song["num_difficult_words"] = textstat.difficult_words(song["text_raw"])
        except Exception as e:
            logging.error(
                "Something bad happened in the current song! Skipping it...\n{}".format(song))
            logging.exception(e)
    return lyrics

def _get_base_textstats(no_code_text):
    """
    Find basic text statistics
    :param no_code_text: Text we are analyzing
    :return: list: List of results
    """
    results = []
    group_by = 'Basic Text Statistics'

    num_chars = len(no_code_text)
    num_lower = sum(1 for c in no_code_text if c.islower())
    num_upper = sum(1 for c in no_code_text if c.isupper())
    num_letters = sum(1 for c in no_code_text if c.isalpha())
    num_numbers = sum(1 for c in no_code_text if c.isdigit())
    num_alphanum = sum(1 for c in no_code_text if c.isalnum())
    num_otherchars = num_chars - num_alphanum

    results.append(TextFeature('Number of characters', num_chars, group_by))
    results.append(TextFeature('Number of letters', num_letters, group_by))
    results.append(TextFeature('Number of numbers', num_numbers, group_by))
    results.append(TextFeature('Number of other characters', num_otherchars, group_by))

    character_counts = Counter(no_code_text.lower())
    for c in sorted(character_counts.items()):
        try:
            results.append(TextFeature('Character count for "{}"'.format(c[0].encode('unicode_escape')), c[1], group_by))
        except AttributeError:
            results.append(TextFeature('Character count for "{}"'.format(c[0]), c[1], group_by))

    results.append(TextFeature('Number of syllables', textstat.syllable_count(no_code_text), group_by))
    results.append(TextFeature('Lexicon Count (without punctuation)', textstat.lexicon_count(no_code_text, True), group_by))
    results.append(TextFeature('Lexicon Count (with punctuation)', textstat.lexicon_count(no_code_text, False), group_by))
    results.append(TextFeature('Number of lower case characters', num_lower, group_by))
    results.append(TextFeature('Number of upper case characters', num_upper, group_by))
    return results

def _schedule_words(text, rate):
    """Determine the time at which to speak each word for slow dictation.

    :returns list: A list of floats, starting from 0 and monotonically
        increasing, corresponding to the time at which to say each word.
    """
    num_syllables = textstat.syllable_count(text)
    total_dictation_time = num_syllables * NUM_WORDS_PER_SYLLABLE / rate
    total_dictation_seconds = total_dictation_time * 60.0
    words = text.split()
    time_per_word = total_dictation_seconds / len(words)
    current_time = 0.0
    for word in words:
        # There's no technical reason to round, it's just nice to not have to
        # worry about making a pretty string representation of the list of
        # timings.
        yield round(current_time, 2)
        word_length = time_per_word
        punctuation_pause = PUNCTUATION_PAUSES.get(word[-1])
        if punctuation_pause:
            word_length += time_per_word * punctuation_pause
        current_time += word_length

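# A minimal usage sketch for _schedule_words above. NUM_WORDS_PER_SYLLABLE and
# PUNCTUATION_PAUSES are module-level constants in the original project; the
# values here are illustrative assumptions, not the project's real settings.
from textstat.textstat import textstat

NUM_WORDS_PER_SYLLABLE = 1 / 1.4           # assumed: ~1.4 syllables per English word
PUNCTUATION_PAUSES = {'.': 2.0, ',': 1.0}  # assumed: extra pause multipliers

timings = list(_schedule_words("Hello there. How are you?", rate=100))
print(timings)  # monotonically increasing floats starting at 0.0
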
def haiku(text):
    words = text.split()
    for word in words:
        try:
            CMU_DICT[re.sub(r'[^\w\s]', '', word.lower())]
        except KeyError:
            # word is not in the CMU dictionary, so we can't trust the count
            return

    syllables = [
        int(math.ceil(textstat.syllable_count(word))) for word in words
    ]
    if sum(syllables) != 17:
        return

    syl_line = [0, 0, 0]
    haiku_lines = ['', '', '']
    for word, syllable_count in zip(words, syllables):
        if syl_line[0] < 5:
            syl_line[0] += syllable_count
            haiku_lines[0] += word + ' '
        elif syl_line[0] == 5 and syl_line[1] < 7:
            syl_line[1] += syllable_count
            haiku_lines[1] += word + ' '
        elif syl_line[0] == 5 and syl_line[1] == 7 and syl_line[2] < 5:
            syl_line[2] += syllable_count
            haiku_lines[2] += word + ' '

    # ain't a haiku
    if syl_line[0] > 5 or syl_line[2] > 5 or syl_line[1] > 7:
        return

    # if it is a haiku, return it as a string
    if syl_line == [5, 7, 5]:
        return ('%s\n%s\n%s' % tuple(haiku_lines))[:-1]

def do_text_stats(self, text):
    # Syllable Count
    syllable_count = textstat.syllable_count(text)
    # Lexicon Count
    lexicon_count = textstat.lexicon_count(text, True)
    # Sentence Count
    sentence_count = textstat.sentence_count(text)

    # The Flesch Reading Ease formula
    #  * 90-100 : Very Easy
    #  * 80-89  : Easy
    #  * 70-79  : Fairly Easy
    #  * 60-69  : Standard
    #  * 50-59  : Fairly Difficult
    #  * 30-49  : Difficult
    #  * 0-29   : Very Confusing
    try:
        flesch_reading_ease = textstat.flesch_reading_ease(text)
    except TypeError:
        flesch_reading_ease = None

    # The Flesch-Kincaid Grade Level
    try:
        flesch_kincaid_grade = textstat.flesch_kincaid_grade(text)
    except TypeError:
        flesch_kincaid_grade = None

    # The Fog Scale (Gunning FOG Formula)
    gunning_fog = textstat.gunning_fog(text)

    # The SMOG Index
    smog_index = textstat.smog_index(text)

    # Automated Readability Index
    automated_readability_index = textstat.automated_readability_index(text)

    # The Coleman-Liau Index
    try:
        coleman_liau_index = textstat.coleman_liau_index(text)
    except TypeError:
        coleman_liau_index = None

    # Linsear Write Formula
    linsear_write_formula = textstat.linsear_write_formula(text)

    # Dale-Chall Readability Score
    dale_chall_readability_score = textstat.dale_chall_readability_score(text)

    # Readability consensus based upon all the above tests
    try:
        text_standard = textstat.text_standard(text)
    except TypeError:
        text_standard = None

    return {
        "syllable_count": syllable_count,
        "lexicon_count": lexicon_count,
        "sentence_count": sentence_count,
        "flesch_reading_ease": flesch_reading_ease,
        "flesch_kincaid_grade": flesch_kincaid_grade,
        "gunning_fog": gunning_fog,
        "smog_index": smog_index,
        "automated_readability_index": automated_readability_index,
        "coleman_liau_index": coleman_liau_index,
        "linsear_write_formula": linsear_write_formula,
        "dale_chall_readability_score": dale_chall_readability_score,
        "text_standard": text_standard
    }

def flesch_kincaid_score(text):
    syl_count = textstat.syllable_count(text)
    word_count = len(text.split())
    sentence_count = textstat.sentence_count(text)
    print("Syl count - %s, word count - %s, sentenceCount - %s"
          % (syl_count, word_count, sentence_count))
    # Flesch-Kincaid grade-level formula; Python 3's true division keeps
    # the ratios from being truncated to integers
    return (0.39 * (word_count / sentence_count)
            + 11.8 * (syl_count / word_count) - 15.59)

def composition(text, file):
    char_count = textstat.char_count(text)
    syll_count = textstat.syllable_count(text)
    lex_count = textstat.lexicon_count(text)
    sent_count = textstat.sentence_count(text)
    file.write(
        '\nChar count : %d\nSyllable count : %d\nLexicon count : %d\nSentence count : %d'
        % (char_count, syll_count, lex_count, sent_count))

def average_syllables_per_word(text):
    """
    :type text: Text
    :param text: The text to be analysed
    :rtype: float
    :returns: Average syllables per word
    """
    return textstat.syllable_count(text.text) / len(text.tokens_alphabetic)

def split_by_syllables(syls, words):
    s_count = 0
    split = 0
    while s_count < syls and split < len(words):
        s_count += round(textstat.syllable_count(words[split]))
        split += 1
    if s_count != syls:
        raise ValueError("Words do not evenly split")
    return words[:split], words[split:]

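# Quick usage sketch for split_by_syllables, assuming textstat is installed.
# The exact split depends on textstat's syllable estimates, so the result
# shown is illustrative.
first, rest = split_by_syllables(2, ["hello", "world"])
print(first, rest)  # e.g. ['hello'] ['world'] -- "hello" alone covers 2 syllables
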
def nsyl(word):
    '''Return the number of syllables in word.'''
    try:
        # look the word up first; d is a CMU pronouncing dictionary
        # (e.g. nltk's cmudict.dict()) mapping words to phoneme lists
        res = [
            len(list(y for y in x if isdigit(y[-1]))) for x in d[word.lower()]
        ][0]
    except (KeyError, IndexError):
        # fall back to textstat's estimate for out-of-dictionary words
        res = np.round(textstat.syllable_count(word))
    return res

def other_features_(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features.

    This is modified to only include those features in the final
    model."""
    sentiment = sentiment_analyzer.polarity_scores(tweet)

    words = preprocess(tweet)  # get text only
    syllables = textstat.syllable_count(words)  # count syllables in words
    num_chars = sum(len(w) for w in words)  # num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables + 0.001)) / float(num_words + 0.001), 4)
    num_unique_terms = len(set(words.split()))

    # Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(
        float(0.39 * float(num_words) / 1.0) + float(11.8 * avg_syl) - 15.59, 1)
    # Modified FRE score, where sentence count is fixed to 1
    FRE = round(
        206.835 - 1.015 * (float(num_words) / 1.0) - (84.6 * float(avg_syl)), 2)

    twitter_objs = count_twitter_objs(tweet)  # count #, @, and http://

    # flag tweets whose preprocessed text matches a known lyric
    lyric = False
    for t in tweets:
        new_t = preprocess(t)
        for l in lyrics:
            l = preprocess(l)
            if new_t == l:
                lyric = True

    features = [
        FKRA, FRE, syllables, num_chars, num_chars_total, num_terms,
        num_words, num_unique_terms, sentiment['compound'], lyric
    ]
    return features

def get_stats(sentence):
    syllables = textstat.syllable_count(sentence)
    words = textstat.lexicon_count(sentence, True)
    sentence_count = textstat.sentence_count(sentence)
    if sentence_count > 0:
        text_standard = textstat.text_standard(sentence)
    else:
        text_standard = EMPTY_TEXT_STANDARD
    text_standard = fix_grammar_errors(text_standard)
    return combine(syllables, words, sentence_count, text_standard)

def get_avg_syl_count(row, is_title):
    """
    Get the average number of syllables per word for an article.

    Args:
        row: The row of data to be considered; here a row of a
            `pandas` `DataFrame` representing one article.
        is_title: A boolean indicating whether the average syllable
            count is for the title rather than the body.

    Returns:
        The average syllable count for the provided row.

    Raises:
        Additional errors may be thrown by dependencies.
    """
    if is_title:
        syl = textstat.syllable_count(row.Title)
        return syl / row.titleWordCount
    syl = textstat.syllable_count(row.Body)
    return syl / row.wordCount

def flesch_kincaid(tokenized_sentences):
    total_syllables = 0
    total_words = 0
    total_sentences = len(tokenized_sentences)
    for tokenized_sent in tokenized_sentences:
        for token in tokenized_sent:
            total_words += 1
            total_syllables += textstat.syllable_count(token)
    # Flesch reading-ease formula (despite the function name),
    # scaled down to roughly the 0-1 range
    score = 206.835
    score -= 1.015 * (total_words / total_sentences)
    score -= 84.6 * (total_syllables / total_words)
    return score / 100

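# A short usage sketch for flesch_kincaid above, with sentences already
# tokenized into lists of word tokens (e.g. via nltk.word_tokenize).
sents = [["The", "cat", "sat", "on", "the", "mat"],
         ["It", "was", "happy"]]
print(flesch_kincaid(sents))  # close to 1.0 for very simple text
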
def find_lowest_syl_count(syn_list):
    # 12 is the largest number of syllables in one English word
    lowest_count = 12
    lowest_count_word = ""
    for synset in syn_list:
        word = synset.name().split('.')[0]
        count = textstat.syllable_count(word)
        if count < lowest_count:
            lowest_count = count
            lowest_count_word = word
    return lowest_count_word

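# Usage sketch for find_lowest_syl_count, assuming nltk's wordnet corpus has
# been downloaded (nltk.download('wordnet')) and textstat is installed.
from nltk.corpus import wordnet

synsets = wordnet.synsets("automobile")
print(find_lowest_syl_count(synsets))  # e.g. "car"
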
def haiku_friend():
    haiku = []
    while True:
        line1 = input("Enter the first line (5 syllables!) ")
        if math.ceil(textstat.syllable_count(line1)) == 5:
            haiku.append(line1)
            break
        else:
            print("Your line should contain exactly 5 syllables!")
    while True:
        line2 = input("Enter the second line (7 syllables!) ")
        if math.ceil(textstat.syllable_count(line2)) == 7:
            haiku.append(line2)
            break
        else:
            print("Your line should contain exactly 7 syllables!")
    while True:
        line3 = input("Enter the third line (5 syllables again!) ")
        if math.ceil(textstat.syllable_count(line3)) == 5:
            haiku.append(line3)
            break
        else:
            print("Your line should contain exactly 5 syllables!")
    os.system('clear')
    print("Here is your haiku:\n")
    print('\n'.join(haiku))
    do_over = input(
        "\nPress any key to make another haiku, or press 'n' to leave :'( ")
    if do_over == 'n':
        quit()
    else:
        haiku_friend()

def make_haiku(model, artist_name):
    haiku_scheme = [5, 7, 5]
    char_limit = 140 - len(artist_name) - 3
    haiku_poem = [artist_name + ' - ']
    for h in haiku_scheme:
        while True:
            sentence = model.make_short_sentence(char_limit,
                                                 max_overlap_total=3)
            if sentence:
                syllables = ceil(textstat.syllable_count(sentence))
                if syllables == h:
                    haiku_poem.append(sentence)
                    break
    haiku = '\n'.join(haiku_poem)
    return haiku

def get_syll_stats(segments, feats_dict):
    """
    Computes statistics on the number of syllables in each word of the
    transcript.

    :param segments: list of segments, where each segment is a list of words
    :param feats_dict: dictionary to store computed feature values
    """
    syll_count_list = []
    for segment in segments:
        for word in segment:
            syll_count_list.append(textstat.syllable_count(word))

    feats_dict['syll_mean'] = np.mean(syll_count_list) if syll_count_list else float('nan')
    feats_dict['syll_median'] = np.median(syll_count_list) if syll_count_list else float('nan')
    feats_dict['syll_stdev'] = np.std(syll_count_list) if syll_count_list else float('nan')
    feats_dict['syll_min'] = min(syll_count_list) if syll_count_list else float('nan')
    feats_dict['syll_max'] = max(syll_count_list) if syll_count_list else float('nan')

def averageSyllable(text):
    count = 0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    no_of_words = len(words)
    for word in words:
        count += textstat.syllable_count(word)
    if no_of_words != 0:
        return float(count) / no_of_words
    else:
        return 0.5

def gettingFeatures(text):
    text = text.lower()

    # word / syllable / sentence counts
    wordCount = len(text.split())
    syllables = textstat.syllable_count(text)
    sentences = textstat.sentence_count(text)

    try:
        # readability score (Flesch reading ease)
        readabilityScore = 206.835 - 1.015 * (wordCount / sentences) - 84.6 * (
            syllables / wordCount)
        # readability grade (Flesch-Kincaid grade level)
        ReadabilityGrade = 0.39 * (wordCount / sentences) + 11.8 * (
            syllables / wordCount) - 15.59
    except ZeroDivisionError:
        readabilityScore = 0
        ReadabilityGrade = 0
    print(readabilityScore, ReadabilityGrade)

    # direction count; direction words: "here", "there", "over there",
    # "beyond", "nearly", "opposite", "under", "above", "to the left",
    # "to the right", "in the distance"
    DirectionCount = (
        text.count("here") + text.count("there") + text.count("over there")
        + text.count("beyond") + text.count("nearly") + text.count("opposite")
        + text.count("under") + text.count("above") + text.count("to the left")
        + text.count("to the right") + text.count("in the distance"))

    # exemplify count; exemplify words: "chiefly", "especially",
    # "for instance", "in particular", "markedly", "namely", "particularly",
    # "including", "specifically", "such as"
    Exemplify = (
        text.count("chiefly") + text.count("especially")
        + text.count("for instance") + text.count("in particular")
        + text.count("markedly") + text.count("namely")
        + text.count("particularly") + text.count("including")
        + text.count("specifically") + text.count("such as"))

    try:
        # words per sentence (average)
        parts = [len(l.split()) for l in re.split(r'[?!.]', text) if l.strip()]
        WPS = sum(parts) / len(parts)
    except ZeroDivisionError:
        WPS = 0

    return numpy.array([
        wordCount, readabilityScore, ReadabilityGrade, DirectionCount, WPS,
        Exemplify
    ])

def percentageOfOneSyllable(text):
    count = 0
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    no_of_words = len(words)
    for word in words:
        if textstat.syllable_count(word) == 1:
            count += 1
    if no_of_words != 0:
        return float(count * 100) / no_of_words
    else:
        return 0.5

def hodorify_syllables(word):
    new_word = ''
    index = 0

    # count syllables in the word
    syl_count = int(textstat.syllable_count(word))

    # find initial and final punctuation
    word_start = 0
    for word_start in range(0, len(word)):
        if word[word_start].isalpha():
            break
    word_end = len(word)
    for word_end in range(len(word), 1, -1):
        if word[word_end - 1].isalpha():
            break

    # split the word from its punctuation
    pre_text = word[:word_start]
    post_text = word[word_end:]
    word = word[word_start:word_end]

    # for each syllable of the word, emit a ho-dor syllable
    for syl_index in range(0, syl_count):
        hodex = syl_index % (len(hodorian_syllables) - 1)
        for dordex in range(0, len(hodorian_syllables[0])):
            if word[index + dordex].isupper():
                new_word += hodorian_syllables[hodex][dordex].upper()
            else:
                new_word += hodorian_syllables[hodex][dordex].lower()
        index += len(hodorian_syllables[0])
        # only print the final "r" at the end of the word
        if hodex == 1:
            if word[index].isupper():
                new_word += hodorian_syllables[2].upper()
            else:
                new_word += hodorian_syllables[2]

    return (pre_text + new_word + post_text)

def __init__(self, text):
    self.sent_count = 0  # number of sentences
    self.word_count = 0  # number of words
    self.char_count = 0  # number of chars, no spaces
    self.syll_count = 0  # number of syllables
    self.comp_count = 0  # number of complex words (three or more syllables)

    # 0. loop over text, add a space after each period if there is none
    text_list = text.split()
    for i in range(len(text_list)):
        if "." in text_list[i]:
            text_list[i] = text_list[i].replace(".", ". ")
    text = " ".join(text_list)

    # 1. parse text into separate sentences based on punctuation
    from nltk.tokenize import sent_tokenize  # requires the 'punkt' model
    sentences = sent_tokenize(text)

    # 2. calculate total sentences
    self.sent_count = len(sentences)

    # 3. remove punctuation (Python 3 translate/maketrans)
    import string
    table = str.maketrans("", "", string.punctuation)
    punc_removed_text = text.translate(table)

    # 4. calculate total characters, excluding spaces
    self.char_count = len(punc_removed_text) - punc_removed_text.count(" ")

    # 5. split text into a list of separate words and count them
    text_list = punc_removed_text.split()
    self.word_count = len(text_list)

    # 6. calculate total syllables and complex words
    from textstat.textstat import textstat
    for word in text_list:
        each_syll = textstat.syllable_count(word)
        if each_syll == 0:
            each_syll = 1
        self.syll_count += each_syll
        if each_syll >= 3:
            self.comp_count += 1

def main():
    for arg in sys.argv[1:]:
        with open(arg) as f:
            text = f.read()
        with open(arg + '.readability.snip', 'w') as f:
            f.write("syllable_count : %s\n" % textstat.syllable_count(text))
            f.write("lexicon_count : %s\n" % textstat.lexicon_count(text))
            f.write("sentence_count : %s\n" % textstat.sentence_count(text))
            f.write("difficult_words : %s\n" % textstat.difficult_words(text))
            f.write("flesch_reading_ease : %s\n" % textstat.flesch_reading_ease(text))
            f.write("flesch_kincaid_grade : %s\n" % textstat.flesch_kincaid_grade(text))
            f.write("smog_index : %s\n" % textstat.smog_index(text))
            f.write("automated_readability_index : %s\n" % textstat.automated_readability_index(text))
            f.write("coleman_liau_index : %s\n" % textstat.coleman_liau_index(text))
            f.write("linsear_write_formula : %s\n" % textstat.linsear_write_formula(text))
            f.write("dale_chall_readability_score : %s\n" % textstat.dale_chall_readability_score(text))

def _words_in_chars(chars, method):
    text = "".join(c for c, _ in chars)
    if method == "ncra":
        # The NCRA defines a "word" to be 1.4 syllables, which is the average
        # number of syllables per English word.
        syllables_per_word = 1.4
        # For some reason, textstat returns syllable counts such as a
        # one-syllable word like "the" being 0.9 syllables.
        syllables_in_text = textstat.syllable_count(text) / 0.9
        return syllables_in_text * (1 / syllables_per_word)
    elif method == "traditional":
        # Formal definition; see https://en.wikipedia.org/wiki/Words_per_minute
        return len(text) / 5
    elif method == "spaces":
        return len([i for i in text.split() if i])
    else:
        assert False, "bad wpm method: " + method

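# Illustrative call of _words_in_chars; in the original project `chars` is a
# sequence of (character, timestamp) pairs, so we fabricate one here.
chars = [(c, 0.0) for c in "the quick brown fox"]
print(_words_in_chars(chars, "traditional"))  # 19 chars / 5 = 3.8 "words"
print(_words_in_chars(chars, "spaces"))       # 4 whitespace-separated tokens
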
def scores_cal_ori(text):
    char_count_value = textstat.char_count(text, ignore_spaces=True)
    lexicon_count_value = textstat.lexicon_count(text, removepunct=True)
    syllable_count_value = textstat.syllable_count(text)
    sentence_count_value = textstat.sentence_count(text)
    avg_sentence_length_value = textstat.avg_sentence_length(text)
    avg_syllables_per_word_value = textstat.avg_syllables_per_word(text)
    avg_letter_per_word_value = textstat.avg_letter_per_word(text)
    avg_sentence_per_word_value = textstat.avg_sentence_per_word(text)
    flesch_kincaid_grade_value = textstat.flesch_kincaid_grade(text)
    smog_index_value = textstat.smog_index(text)
    gunning_fog_value = textstat.gunning_fog(text)
    difficult_words_value = textstat.difficult_words(text)
    dale_chall_value = textstat.dale_chall_readability_score(text)
    polysyllab_value = textstat.polysyllabcount(text)
    return (char_count_value, lexicon_count_value, syllable_count_value,
            sentence_count_value, avg_sentence_length_value,
            avg_syllables_per_word_value, avg_letter_per_word_value,
            avg_sentence_per_word_value, flesch_kincaid_grade_value,
            smog_index_value, gunning_fog_value, difficult_words_value,
            dale_chall_value, polysyllab_value)

def nsyllables(numsyls, tags):
    """
    Finds an n-syllable phrase in a dict keyed by syllable count (an int),
    with values of lists of applicable words, e.g.:
        {5: ['polysyllabic', 'proletariat'],
         4: ['polyganol', 'pollywantsa'],
         3: ['Pauly Shore', 'polishing'],
         2: ['poly', 'goner'],
         1: ['Paul', 'pole', 'Pawn']}
    """
    if numsyls <= 0:
        return [randomonesyl(), tags]
    if numsyls in tags and len(tags[numsyls]) >= 1:
        return [tags[numsyls].pop(0), tags]
    # otherwise build the phrase recursively from shorter pieces; the extra
    # -1 leaves room for the one-syllable word the base case supplies
    part = nsyllables(numsyls - 1, tags)[0]
    remainder = nsyllables(numsyls - int(textstat.syllable_count(part)) - 1,
                           tags)[0]
    return [str(part) + " " + str(remainder), tags]

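# Brief usage sketch for nsyllables, assuming a randomonesyl() helper exists
# (as in the snippet above) that returns a random one-syllable word.
tags = {5: ["proletariat"], 2: ["poly", "goner"], 1: ["Paul", "pole"]}
line, remaining = nsyllables(5, tags)
print(line)  # "proletariat" -- the word is consumed from `remaining`
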
def analyse_json(json_text):
    # consider moving this to be a feature of Transcript in the other module
    df_witnesses = pd.DataFrame(
        columns=['html_file_location', 'witness_name', 'syllable_count',
                 'lexicon_count', 'sentence_count', 'syllables_per_word',
                 'gunning_fog', 'smog_index', 'text_standard'],
        index=[])
    trscrpt = json.loads(json_text)
    if 'witnesses' in trscrpt:
        witnesses = trscrpt['witnesses']

        # gather each witness's spoken text
        for s in trscrpt['all_sections']:
            if 'speaker' in s and 'person' in s['speaker'] and \
                    s['speaker']['person']['speaker_type'] == 'witness':
                witness = witnesses[s['speaker']['person']['name']]
                witness.setdefault('all_text', []).append(s['spoken_text'])

        for i, p in enumerate(witnesses):
            if 'all_text' in witnesses[p]:
                witness_text = '\n\n'.join(witnesses[p]['all_text'])
                if len(witness_text) > 0:
                    stats_data = {
                        'html_file_location': trscrpt['html_file_location'],
                        'witness_name': p,
                        'syllable_count': textstat.syllable_count(witness_text),
                        'lexicon_count': textstat.lexicon_count(witness_text),
                        'sentence_count': textstat.sentence_count(witness_text),
                        'syllables_per_word': textstat.avg_syllables_per_word(witness_text),
                        'gunning_fog': textstat.gunning_fog(witness_text),
                        'smog_index': textstat.smog_index(witness_text),
                        'text_standard': textstat.text_standard(witness_text),
                    }
                    df_witnesses.loc['witness_%i' % i] = stats_data
                else:
                    df_witnesses.loc['witness_%i' % i, 'html_file_location'] = \
                        trscrpt['html_file_location']
                    df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
            else:
                df_witnesses.loc['witness_%i' % i, 'html_file_location'] = \
                    trscrpt['html_file_location']
                df_witnesses.loc['witness_%i' % i, 'witness_name'] = p
    return df_witnesses

def analyseText():
    values = request.get_json()
    required = ['inputText']
    if not all(k in values for k in required):
        return 'Missing values', 400

    text = values['inputText']
    result = {
        'syllable_count': textstat.syllable_count(text),
        'lexicon_count': textstat.lexicon_count(text),
        'sentence_count': textstat.sentence_count(text),
        'flesch_reading_ease': textstat.flesch_reading_ease(text),
        'flesch_kincaid_grade': textstat.flesch_kincaid_grade(text),
        'gunning_fog': textstat.gunning_fog(text),
        'smog_index': textstat.smog_index(text),
        'automated_readability_index': textstat.automated_readability_index(text),
        'coleman_liau_index': textstat.coleman_liau_index(text),
        'linsear_write_formula': textstat.linsear_write_formula(text),
        'dale_chall_readability_score': textstat.dale_chall_readability_score(text)
    }
    return jsonify(result), 200

def feature_readability(essay):
    syllable_count = textstat.syllable_count(essay)  # syllable count
    flesch_reading_ease = textstat.flesch_reading_ease(essay)  # readability score between 0 and 100
    smog_index = textstat.smog_index(essay)  # SMOG index: readability, more precise and easier to compute
    flesch_kincaid_index = textstat.flesch_kincaid_grade(essay)  # grade-level score
    coleman_liau_index = textstat.coleman_liau_index(essay)  # returns the text's grade level
    automated_readability_index = textstat.automated_readability_index(essay)  # ARI: approximate grade needed to understand the text
    dale_chall_readability_score = textstat.dale_chall_readability_score(essay)  # grade level, based on the most common English words
    difficult_words = textstat.difficult_words(essay)
    linsear_write_formula = textstat.linsear_write_formula(essay)  # returns the text's grade level
    gunning_fog = textstat.gunning_fog(essay)  # fog index: reading difficulty of the text

    return (syllable_count, flesch_reading_ease, smog_index,
            flesch_kincaid_index, coleman_liau_index,
            automated_readability_index, dale_chall_readability_score,
            difficult_words, linsear_write_formula, gunning_fog)

def text_stats(corpus):
    tk = TweetTokenizer(preserve_case=False, reduce_len=True,
                        strip_handles=True).tokenize
    toks = [tk(entry) for entry in corpus]

    # get function words
    with open('function_words.txt') as file:
        funcs = file.read().split(',')
    funcs = [f.strip() for f in funcs]

    amb = ambiguity(toks)  # calculate ambiguity

    matrix = [["Chars/Word", "Lexical Diversity", "Lexical Density",
               "Function Words", "Syllables", "ARI"]]
    for tokens, sentence in zip(toks, corpus):
        unique = set(tokens)
        avchar = 0
        lexdiv = 0
        lexden = 0
        nfunc = 0
        numsyl = 0
        ari = 0
        if len(sentence) > 1:
            lexdiv = len(unique) / len(tokens)  # lexical diversity
            lexden = len([x for x in tokens if x not in funcs]) / len(tokens)  # lexical density
            numsyl = textstat.syllable_count(sentence) / len(tokens) / 10  # syllables per token
            # may be a bit dodgy without punctuation
            ari = abs(textstat.automated_readability_index(sentence)) / 14  # automated readability index
            for t in tokens:
                avchar += len(t) / len(tokens) / len(sentence)  # average number of chars
                if t in funcs:
                    nfunc += 1 / len(tokens)  # proportion of function words
        matrix.append([avchar, lexdiv, lexden, nfunc, numsyl, ari])
    matrix = [m + [a] for m, a in zip(matrix, amb)]
    return np.array(matrix)

# all = {1: 'CC', 2: 'CD', 3: 'DT', 4: 'EX', 5: 'FW', 6: 'IN', 7: 'JJ', 8: 'JJR', 9: 'JJS', 10: 'LS', 11: 'MD', 12: 'NN', 13: 'NNS', 14: 'NNP', 15: 'NNPS', 16: 'PDT', 17: 'POS', 18: 'PRP', 19: 'PRP$', 20: 'RB', 21: 'RBR', 22: 'RBS', 23: 'RP', 24: 'SYM', 25: 'TO', 26: 'UH', 27: 'VB', 28: 'VBD', 29: 'VBG', 30: 'VBN', 31: 'VBP', 32: 'VBZ', 33: 'WDT', 34: 'WP', 35: 'WP$', 36: 'WRB', 'NN': 12, 'FW': 5, 'PRP': 18, 'RB': 20, 'NNS': 13, 'NNP': 14, 'PRP$': 19, 'WRB': 36, 'CC': 1, 'PDT': 16, 'VBN': 30, 'WP$': 35, 'JJS': 9, 'JJR': 8, 'SYM': 24, 'VBP': 31, 'WDT': 33, 'JJ': 7, 'VBG': 29, 'WP': 34, 'VBZ': 32, 'DT': 3, 'POS': 17, 'TO': 25, 'LS': 10, 'VB': 27, 'RBS': 22, 'RBR': 21, 'EX': 4, 'IN': 6, 'RP': 23, 'CD': 2, 'VBD': 28, 'MD': 11, 'NNPS': 15, 'UH': 26, '.': 37, 37: '.', ':': 38, 38: ':', '-NONE-': 39, 39: '-NONE-', ',': 40, 40: ','}
# ui = []

print("grammar for the essays")
for index in range(len(df)):
    p = df.essay[index]
    p1 = nltk.word_tokenize(p.lower())
    p2 = nltk.pos_tag(p1)
    counts = Counter(tag for word, tag in p2)
    print(counts)
    total = sum(counts.values())
    print(dict((word, float(count) / total) for word, count in counts.items()))

print("")
print("readability/complexity")
for index in range(len(df)):
    r = df.essay[index]
    print(textstat.syllable_count(r))
    print(textstat.readability_consensus(r))
print("")
# print(textstat.flesch_reading_ease(r))
# print(textstat.flesch_kincaid_grade(r))

"""for index in range(len(df)):
    r = df.essay[index]
    for words in r.split():
        words1 = [w1 for w1 in words if not w1 in stopwords.words("english")]
        print(words1)"""

# Example
print("normalizing values")
ranger = interp1d([1, 512], [1, 10])

WAV_FILE = path.join(path.dirname(path.realpath(__file__)), "abc.wav")

# use "abc.wav" as the audio source
r = sr.Recognizer()
with sr.WavFile(WAV_FILE) as source:
    audio = r.record(source)  # read the entire WAV file

# recognize speech using Google Speech Recognition
try:
    # for testing purposes, we're just using the default API key
    # to use another API key, use `r.recognize_google(audio, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
    # instead of `r.recognize_google(audio)`
    print("Google Speech Recognition thinks you said " + r.recognize_google(audio))
    words = r.recognize_google(audio).split()
    for word in words:
        print("%%", word, textstat.syllable_count(word))
    print(textstat.syllable_count(r.recognize_google(audio)))
except sr.UnknownValueError:
    print("Google Speech Recognition could not understand audio")
except sr.RequestError as e:
    print("Could not request results from Google Speech Recognition service; {0}".format(e))

'''
# recognize speech using Wit.ai
WIT_AI_KEY = "INSERT WIT.AI API KEY HERE"  # Wit.ai keys are 32-character uppercase alphanumeric strings
try:
    print("Wit.ai thinks you said " + r.recognize_wit(audio, key=WIT_AI_KEY))
except sr.UnknownValueError:
    print("Wit.ai could not understand audio")
except sr.RequestError as e:
    print("Could not request results from Wit.ai service; {0}".format(e))
'''

#!/bin/python
import sys
from textstat.textstat import textstat

script_name = sys.argv[0]
inputfile = sys.argv[1]

with open(inputfile) as myfile:
    test_data = "".join(line.rstrip() for line in myfile)

var1 = str(textstat.flesch_reading_ease(test_data))
var2 = str(textstat.smog_index(test_data))
var3 = str(textstat.flesch_kincaid_grade(test_data))
var4 = str(textstat.coleman_liau_index(test_data))
var5 = str(textstat.automated_readability_index(test_data))
var6 = str(textstat.dale_chall_readability_score(test_data))
var7 = str(textstat.difficult_words(test_data))
var8 = str(textstat.linsear_write_formula(test_data))
var9 = str(textstat.gunning_fog(test_data))
var10 = str(textstat.readability_consensus(test_data))
var11 = str(textstat.syllable_count(test_data))
var12 = str(textstat.lexicon_count(test_data, 1))
var13 = str(textstat.sentence_count(test_data))

print(','.join([var1, var2, var3, var4, var5, var6, var7, var8, var9,
                var10, var11, var12, var13]))

# (fragment: word-scoring branch from a larger loop; `map` holds per-word
# feature tuples and the outer `if`/final `else` are not shown)
            FAM_scores.append(MIN_FAM)
        else:
            FAM_scores.append(float(map[words[i]][1]))
        if float(map[words[i]][2]) == 0:
            IMG_scores.append(MIN_IMG)
        else:
            IMG_scores.append(float(map[words[i]][2]))
        if float(map[words[i]][3]) == 0:
            CONC_scores.append(MIN_CONC)
        else:
            CONC_scores.append(float(map[words[i]][3]))
        if float(map[words[i]][4]) == 0:
            # no syllable count on record; fall back to textstat's estimate
            NSYL_scores.append(textstat.syllable_count(words[i]))
        else:
            NSYL_scores.append(float(map[words[i]][4]))
        if float(map[words[i]][5]) == 0:
            FREQ_scores.append(MIN_FREQ)
        else:
            FREQ_scores.append(float(map[words[i]][5]))
        if float(map[words[i]][6]) == 0:
            AOA_scores.append(MAX_AOA)
        else:
            AOA_scores.append(float(map[words[i]][6]))
    else:
        FAM_scores.append(MIN_FAM)

# semicolon_count
try:
    semicolon_count = count_semicolon(AB)
except Exception:
    warning_message = 1

# comma_count
try:
    comma_count = count_comma(AB)
except Exception:
    warning_message = 1

# num_syllables
try:
    num_syllables = textstat.syllable_count(AB)
except Exception:
    warning_message = 1

# word_count
try:
    word_count = textstat.lexicon_count(AB)
except Exception:
    warning_message = 1

# avg_word_len
try:
    avg_word_len = avg_word_length(AB)
except Exception:
    warning_message = 1

    # Build Dataset (fragment: the enclosing loop and outer `try` are not shown)
    try:
        cur = {
            "title": title,
            "artist": artist,
            "year": year,
            "pos": pos,
            "lyrics": lyrics,
            "tags": get_tags(artist),
            "sentiment": sent_analyzer.polarity_scores(lyrics_repl),
            "f_k_grade": ts.flesch_kincaid_grade(lyrics_repl),
            "flesch_index": ts.flesch_reading_ease(lyrics_repl),
            "fog_index": ts.gunning_fog(lyrics_repl),
            "difficult_words": ts.difficult_words(lyrics_repl),
            "num_syllables": ts.syllable_count(lyrics_repl),
            "num_words": ts.lexicon_count(lyrics_repl, True),
            "num_lines": ts.sentence_count(lyrics_repl),
            "num_dupes": count_dupes(lyrics)
        }
        # print(cur)
        dataset.append(cur)
    except Exception as e:
        print(e)
except Exception as e:
    print("Exception occurred for " + artist + ' - ' + title)
    print(e)

outfile = "years/" + str(year) + '.txt'
dir = os.path.dirname(outfile)

from syllabify import syllabify
import count_syl as cs
from textstat.textstat import textstat

a = syllabify('hello')
# ^ this doesn't really work: it's for ARPAbet transcriptions, not English words

b = cs.count_syllables('accident')
# ^ this script seems to work pretty well, but gives a lower and upper bound

c = textstat.syllable_count('fragmentation')
# ^ works well


def get_mnemonic_syllables(mn):
    return sum(textstat.syllable_count(a) for a in mn.split())

from nltk.corpus import wordnet as wn
from textstat.textstat import textstat
from pattern.en import conjugate, PRESENT, PARTICIPLE

"""
Run with `python collect_squattings.py > unfiltered_squattings.txt`
"""

# Collect the present participle (ending in 'ing') of single-syllable verbs
verbs = list(wn.all_synsets('v'))
squattings = []
for item in verbs:
    for verb in item.lemmas():
        syllables = round(textstat.syllable_count(verb.name()))
        if syllables == 1.0:
            squat = verb.name().replace("_", " ")
            squatting = conjugate(squat, PRESENT + PARTICIPLE)
            print(squatting)

from nltk.corpus import wordnet as wn
from textstat.textstat import textstat

"""
Run with `python collect_naked.py > unfiltered_naked.txt`
"""

adjectives = list(wn.all_synsets('a')) + list(wn.all_synsets('s'))
nakeds = []

# Collect all two-syllable adjectives
for item in adjectives:
    for adj in item.lemmas():
        syllables = round(textstat.syllable_count(adj.name()))
        if syllables == 2.0:
            naked = adj.name().replace("_", " ")
            nakeds.append(naked)

# Uniques only
nakeds = set(nakeds)
for naked in nakeds:
    print(naked)

from nltk.corpus import wordnet as wn
from textstat.textstat import textstat

"""
Run with `python collect_hearts.py > unfiltered_hearts.txt`
"""

# Find a specific synset of "body"
body = wn._synset_from_pos_and_offset('n', 5216365)

# Keep searching for part_meronyms until there are no more
def collect_parts_recursive(body, collection):
    parts = body.part_meronyms()
    for part in parts:
        collection.append(part)
        collect_parts_recursive(part, collection)
    return collection

all_parts = collect_parts_recursive(body, [])

# Collect heart replacements: one-syllable body-part names
hearts = []
for part in all_parts:
    heart = part.name().split('.')[0]
    heart = heart.replace("_", " ")
    syllables = round(textstat.syllable_count(heart))
    if syllables == 1.0:
        print(heart)
