def generate_n_syllables(self, obs_map, obs_map_r, n_syllables, states):
    words = []
    emissions = []
    if not states:
        curr_state = np.random.choice(list(range(self.L)), p=self.A_start)
    else:
        curr_state = states[-1]
    while syllables.estimate(' '.join(words)) < n_syllables:
        curr_state = np.random.choice(list(range(self.L)), p=self.A[curr_state])
        states.append(curr_state)
        emission_i = np.random.choice(list(range(self.D)), p=self.O[curr_state])
        emissions.append(emission_i)
        words.append(obs_map_r[emission_i])
        if syllables.estimate(' '.join(words)) > n_syllables:
            # Undo the last choice: it pushed the line past the syllable budget
            words = words[:-1]
            emissions = emissions[:-1]
            states = states[:-1]
    return ' '.join(words), states
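# A minimal standalone sketch (not from the source) of the same fill-then-undo
# pattern used by generate_n_syllables above, with a uniform random word choice
# standing in for the HMM transition and emission draws. fill_to_syllables and
# the vocabulary are illustrative; the vocabulary should contain at least one
# one-syllable word or the loop may never hit the budget exactly.
import random
import syllables

def fill_to_syllables(vocab, n_syllables):
    words = []
    while syllables.estimate(' '.join(words)) < n_syllables:
        words.append(random.choice(vocab))
        if syllables.estimate(' '.join(words)) > n_syllables:
            words.pop()  # undo the word that overshot the budget
    return ' '.join(words)

print(fill_to_syllables(['sky', 'river', 'golden', 'sun'], 5))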
def main():
    print(syllables.estimate('estimate'))
    print(syllables.estimate('emudeceram'))
    print(len('emudeceram'))
def get_features_spacy(tok):
    # Fall back to "UNKNOWN" when spaCy assigns no entity type
    ent_type = tok.ent_type_ if tok.ent_type_ != '' else "UNKNOWN"
    return [
        tok.text,
        tok.pos_,
        tok.tag_,
        syllables.estimate(tok.text),
        tok.is_stop,
        ent_type,
        tok.dep_,
    ]
def readability():
    m = input("Would you like to analyze FKGL or SMOG?")
    x = []
    if m.lower() == 'fkgl':
        for dataset in datasets:
            tweets = []
            for tweet in dataset:
                words = 0
                sentences = 0
                syl = 0
                blob = textblob.TextBlob(tweet)
                for sentence in blob.sentences:
                    sentences += 1
                    for word in sentence.words:
                        words += 1
                        syl += syllables.estimate(word)
                # Flesch-Kincaid Grade Level:
                # 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
                tweets.append((0.39 * (words / sentences) + 11.8 * (syl / words)) - 15.59)
            x.append(sum(tweets) / len(tweets))
        plt.bar(y, x)
        plt.xticks(rotation=45, ha="right")
        plt.xlabel('Twitter Handle')
        plt.ylabel('Flesch-Kincaid Grade Level')
        plt.title('Flesch-Kincaid Grade Level By Twitter Handle')
        plt.show()
        prntmethod(x)
    elif m.lower() == 'smog':
        for dataset in datasets:
            tweets = []
            for tweet in dataset:
                sent = 0
                polysyl = 0
                blob = textblob.TextBlob(tweet)
                for sentence in blob.sentences:
                    sent += 1
                    for word in sentence.words:
                        # SMOG counts polysyllabic words (3+ syllables)
                        if syllables.estimate(word) >= 3:
                            polysyl += 1
                # SMOG grade: 1.043 * sqrt(polysyllables * 30 / sentences) + 3.1291
                tweets.append((1.043 * math.sqrt(polysyl * 30 / sent)) + 3.1291)
            x.append(sum(tweets) / len(tweets))
        plt.bar(y, x)
        plt.xticks(rotation=45, ha="right")
        plt.xlabel('Twitter Handle')
        plt.ylabel('SMOG Level')
        plt.title('SMOG Level By Twitter Handle')
        plt.show()
        prntmethod(x)
    else:
        print("Sorry, that type of analysis is not supported. Please try again.")
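# A quick worked check (illustrative numbers, not from the source) of the
# FKGL formula used in the 'fkgl' branch above:
words, sentences, syl = 12, 1, 16
fkgl = 0.39 * (words / sentences) + 11.8 * (syl / words) - 15.59
print(round(fkgl, 2))  # 0.39*12 + 11.8*(16/12) - 15.59 = 4.82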
def search(dataset):
    tweets_pol = []
    tweets_sub = []
    tweets_fkgl = []
    tweets_smog = []
    tweets_form = []
    for tweet in dataset:
        ws = 0
        sent = 0
        syl = 0
        f = 0  # "formal" POS tags: nouns, adjectives, prepositions, articles
        c = 0  # "contextual" POS tags: pronouns, verbs, adverbs, interjections
        polysyl = 0
        blob = textblob.TextBlob(tweet)
        tweets_pol.append(blob.polarity)
        tweets_sub.append(blob.subjectivity)
        for sentence in blob.sentences:
            sent += 1
            for word in sentence.words:
                syl += syllables.estimate(word)
                ws += 1
                if syllables.estimate(word) >= 3:
                    polysyl += 1
        for word, tag in blob.tags:
            if tag in ("NN", "JJ", "IN", "DT"):
                f += 1
            elif tag in ("PR", "VB", "RB", "UH"):
                c += 1
        tweets_smog.append((1.043 * math.sqrt(polysyl * 30 / sent)) + 3.1291)
        tweets_fkgl.append((0.39 * (ws / sent) + 11.8 * (syl / ws)) - 15.59)
        # Formality (F-score); default to the neutral 50 when no tagged words.
        # (The f == c case already yields exactly 50 from the formula.)
        if f + c == 0:
            tweets_form.append(50)
        else:
            tweets_form.append(50 * (((f - c) / (f + c)) + 1))
    x = [
        sum(tweets_pol) / len(tweets_pol),
        sum(tweets_sub) / len(tweets_sub),
        sum(tweets_smog) / len(tweets_smog),
        sum(tweets_fkgl) / len(tweets_fkgl),
        sum(tweets_form) / len(tweets_form),
    ]
    print()
    print("Average polarity:", x[0])
    print("Average subjectivity:", x[1])
    print("Average Flesch-Kincaid Grade Level:", x[3])
    print("Average SMOG index:", x[2])
    print("Average Formality index:", x[4])
    print()
def syllableCountHindi(text):
    count = 0
    text = removePunctuationHindi(text)
    words = text.split()
    for i in words:
        count += syllables.estimate(i)
    return count
def w_g_4Hindi(text):
    # Count words with more than four estimated syllables
    count = 0
    words = text.split()
    for x in words:
        if syllables.estimate(x) > 4:
            count += 1
    return count
def score(self, text: str, args: list) -> float:
    # Define the appropriate target reading-ease score for each audience level.
    # Only one argument in args, which is audience.
    audience = args[0]
    desired_scores = {
        "basic": 70.0,
        "intermediate": 55.0,
        "difficult": 40.0
    }
    """The score is computed with the Flesch Reading Ease formula:
    206.835 - 1.015 * (total words / total sentences)
            - 84.6 * (total syllables / total words)
    """
    total_sentences = len(nltk.sent_tokenize(text))
    words = nltk.word_tokenize(text)
    total_words = len(words)
    total_syllables = 0
    for word in words:
        total_syllables += syllables.estimate(word)
    fkgl_score = 206.835 - 1.015 * (
        total_words / total_sentences) - 84.6 * (total_syllables / total_words)
    # Calculate how many multiples of 5 the score is away from the desired value
    deviation = abs(fkgl_score - desired_scores[audience]) / 5
    to_subtract = deviation * 0.2
    if to_subtract > 1:
        grade = 0
    else:
        grade = 1 - to_subtract
    return grade
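# A worked trace (illustrative values, not from the source) of the
# deviation-to-grade mapping in score() above, for audience "basic"
# (target 70.0) and a computed reading-ease score of 62.5:
fkgl_score, desired = 62.5, 70.0
deviation = abs(fkgl_score - desired) / 5  # 7.5 / 5 = 1.5
to_subtract = deviation * 0.2              # 0.3
grade = 0 if to_subtract > 1 else 1 - to_subtract
print(grade)                               # ~0.7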
def test_estimate():
    EXPECTED_ACCURACY = .75
    hits = []
    misses = []
    d = cmudict.dict()
    for word in d:
        phones = d[word][0]
        cmudict_syllables = 0
        for phone in phones:
            # Vowel phones carry a stress marker (0, 1, or 2); count them
            if re.match(r"\w*[012]$", phone):
                cmudict_syllables += 1
        estimated_syllables = syllables.estimate(word)
        if cmudict_syllables == estimated_syllables:
            hits.append(word)
        else:
            misses.append(word)
    hit = len(hits)
    miss = len(misses)
    total = hit + miss
    ACCURACY = hit / total
    if ACCURACY < EXPECTED_ACCURACY:
        raise AssertionError(
            'syllables.estimate(): Expected accuracy of {0}, got {1}.'.format(
                EXPECTED_ACCURACY, ACCURACY))
def Coherence_M_5(docs_A):
    # Measure Flesch-Kincaid Grade Level after removing stop words
    import syllables
    Factor_1 = 0.39
    Factor_2 = 11.8
    Factor_3 = 15.59
    Coh_M = np.array([])
    MyDoc = docs_A
    for k in range(len(MyDoc)):
        doc = str(MyDoc[k])
        Sent_doc = sent_tokenize(doc)
        T_Sent = len(Sent_doc)  # Total sentences
        tokenized_word = word_tokenize(doc)
        word_tokens = [w for w in tokenized_word if w.isalpha()]
        Syllables_Word = [syllables.estimate(w) for w in word_tokens]
        T_Syll = sum(Syllables_Word)  # Total syllables
        T_Word = len(word_tokens)     # Total words
        # FKGL = 0.39 * (words / sentences) + 11.8 * (syllables / words) - 15.59
        # (the grade-level constant is subtracted, not added)
        M = Factor_1 * (T_Word / T_Sent) + Factor_2 * (T_Syll / T_Word) - Factor_3
        Coh_M = np.append(Coh_M, M)
    return [Coh_M]
def Flesch_Kincaid(doc: str) -> float:
    """Returns the Flesch-Kincaid grade level of doc."""
    words = len(doc.split())
    # Count sentences as the number of '.', '?', or '!' characters not
    # immediately preceded by another one of those (and not the first character)
    sentences = 0
    for i in range(len(doc)):
        if (doc[i] == '.' or doc[i] == '?' or doc[i] == '!') and (i != 0) and (
                doc[i - 1] != '.' and doc[i - 1] != '?' and doc[i - 1] != '!'):
            sentences += 1
    # Every document must contain at least 1 sentence (avoids division by zero)
    sentences = max(sentences, 1)
    syllables = 0
    for w in doc.split():
        # A cmudict lookup would theoretically be more accurate, but it's too
        # slow here, so use the syl package's estimate instead
        syllables += syl.estimate(w)
    # Taking max(words, 1) so we don't divide by zero
    return (0.39 * words / sentences) + (11.8 * syllables / max(words, 1)) - 15.59
def get_rhyme_word(mc, a_rhyme, b_rhyme, syllable):
    """
    Gets a word with a given syllable count that rhymes with a_rhyme or b_rhyme.
    Calls rhymes_all_words().

    :param mc: dict
    :param a_rhyme: string
    :param b_rhyme: string
    :param syllable: int
    :return: chosen_word: string

    Examples
    --------
    >>> get_rhyme_word(markov_chain, 'the', None, 2)
    lightly
    >>> get_rhyme_word(markov_chain, None, 'people', 3)
    Seasonal
    """
    # Run rhymes_all_words(), which returns rhymes of a_rhyme or b_rhyme,
    # filter out all rhymes whose syllable count differs from `syllable`,
    # then randomly pick one of the remaining rhymes
    rhyme = a_rhyme if b_rhyme is None else b_rhyme
    chosen_word = "@"
    rhymes_prob = rhymes_all_words(rhyme, mc)
    rhymes_prob = [r for r in rhymes_prob if syllables_p.estimate(r) == syllable]
    try:
        chosen_word = random.choice(rhymes_prob)
    except IndexError:
        # No rhyme matched the syllable count; keep the "@" placeholder
        pass
    return chosen_word
def haiku_check(self, sentence):
    """ Check message and return formatted haiku """
    # Collapse double spaces before splitting into words
    body_array = sentence.replace('  ', ' ').split(' ')
    line_1 = ''
    line_1_syllables = 0
    line_2 = ''
    line_2_syllables = 0
    line_3 = ''
    line_3_syllables = 0
    syllable_count = 0
    for word in body_array:
        word_syllables = syllables.estimate(word)
        syllable_count = syllable_count + word_syllables
        if line_1_syllables + word_syllables <= 5:
            line_1_syllables = line_1_syllables + word_syllables
            line_1 = line_1 + ' ' + word
        elif line_2_syllables + word_syllables <= 7 and line_1_syllables == 5:
            line_2_syllables = line_2_syllables + word_syllables
            line_2 = line_2 + ' ' + word
        elif line_3_syllables + word_syllables <= 5 and line_1_syllables == 5 and line_2_syllables == 7:
            line_3_syllables = line_3_syllables + word_syllables
            line_3 = line_3 + ' ' + word
    return {
        'syllable_count': syllable_count,
        'line_1_syllables': line_1_syllables,
        'line_1': line_1,
        'line_2_syllables': line_2_syllables,
        'line_2': line_2,
        'line_3_syllables': line_3_syllables,
        'line_3': line_3
    }
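# A hedged usage sketch: haiku_check is an instance method, so HaikuBot below
# is a hypothetical minimal host class added only so the call runs. The 5/7/5
# split depends on syllables.estimate, so the counts are approximate.
import syllables

class HaikuBot:
    haiku_check = haiku_check  # reuse the method defined above

result = HaikuBot().haiku_check("An old silent pond a frog jumps in the sound of water")
print(result['line_1'], '/', result['line_2'], '/', result['line_3'])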
def rhyming_word_generator(rhyming_scheme):
    for alphabet in rhyming_scheme:
        if alphabet in rhyme_pattern:
            # A line of this rhyme type already exists; show what to match
            words = rhyme_pattern[alphabet]
            print("To match with line type '{}' with {} syllables in total: {}"
                  .format(alphabet, sentence_syllables[alphabet], words))
        else:
            words = []
            sentence = input("Enter your line: ")
            sentence_words = sentence.split(" ")
            sentence_syllables[alphabet] = syl.estimate(sentence)
            print(sentence_syllables[alphabet])
            last_word = sentence_words.pop()
            rhyme_candidates = pronouncing.rhymes(last_word)
            # Suggest four random rhymes for the line's last word
            for i in range(4):
                words.append(random.choice(rhyme_candidates))
            rhyme_pattern[alphabet] = words
def get_n_syllables(word):
    word = inverse_replace_punctuation(word)
    word = word.lower()
    if word in cmudict_dict:
        # Count stress-marked (vowel) phones in the first pronunciation
        return sum(phone[-1].isdigit() for phone in cmudict_dict[word][0])
    return syllables.estimate(word)
def create_adjective_data_rows(adjectives_list):
    frequencies = get_frequencies(adjectives_list)
    probabilities = get_probabilities(adjectives_list)
    syllable_counts = [
        syllables.estimate(adjective) for adjective in adjectives_list
    ]
    return zip(adjectives_list, frequencies, probabilities, syllable_counts)
def Coherence_M_4(docs_A):
    # Measure readability (Flesch Reading Ease) after removing stop words
    import syllables
    Factor_1 = 206.835
    Factor_2 = 1.015
    Factor_3 = 84.6
    Coh_M = np.array([])
    MyDoc = docs_A
    for k in range(len(MyDoc)):
        doc = str(MyDoc[k]).lower()
        Sent_doc = sent_tokenize(doc)
        T_Sent = len(Sent_doc)  # Total sentences
        tokenized_word = word_tokenize(doc)
        word_tokens = [w for w in tokenized_word if w.isalpha()]
        Syllables_Word = [syllables.estimate(w) for w in word_tokens]
        T_Syll = sum(Syllables_Word)  # Total syllables
        T_Word = len(word_tokens)     # Total words
        # Flesch Reading Ease = 206.835 - 1.015 * (words / sentences)
        #                               - 84.6 * (syllables / words)
        M = Factor_1 - Factor_2 * (T_Word / T_Sent) - Factor_3 * (T_Syll / T_Word)
        Coh_M = np.append(Coh_M, M)
    return [Coh_M]
def Validate(lineList):
    # Keep only lines with exactly ten syllables
    valid = []
    for sentence in lineList:
        sylls = SY.estimate(sentence)
        if sylls == 10:
            valid.append(sentence)
    return valid
def get_average_syllables(bars):
    """ Calculate the average number of syllables per bar """
    bar_syllables = [syllables.estimate(bar) for bar in bars]
    return np.mean(np.asarray(bar_syllables))
def add_effects(subtitles):
    effected_subs = SSAFile()
    for sub in subtitles:
        content = sub.plaintext.strip().replace('\n', ' ')
        # Split the subtitle's duration evenly across its estimated syllables
        time_per_syllable = (sub.end - sub.start) / syllables.estimate(content)
        current_time = sub.start
        current_index = 0
        for word in content.split(' '):
            sylls = syllables.estimate(word)
            sub_end_time = current_time + time_per_syllable * sylls
            # Advance past the word (plus the separating space after the first word)
            current_index += len(word) if current_index == 0 else len(word) + 1
            # Everything after the current word is rendered fully transparent
            text = content[:current_index] + '{\\alpha&HFF}' + content[current_index:]
            effected_subs.append(
                SSAEvent(start=current_time, end=sub_end_time, text=text))
            current_time = sub_end_time
    return effected_subs
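# A hedged usage sketch for add_effects, assuming pysubs2 (which provides
# SSAFile and SSAEvent, with times in milliseconds) is installed; the subtitle
# text and output file name are illustrative.
import syllables
from pysubs2 import SSAFile, SSAEvent

subs = SSAFile()
subs.append(SSAEvent(start=0, end=2000, text="Hello beautiful world"))
add_effects(subs).save("effected.ass")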
def estimate_syllables(haiku: str):
    """Estimate the number of syllables in each line of the given haiku."""
    lines = haiku.split("/")
    counts = []
    for line in lines:
        words = line.strip(" \t\n#").split()
        counts.append(sum(syllables.estimate(w) for w in words))
    return tuple(counts)
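# Usage sketch: estimate_syllables expects the haiku's lines separated by "/"
# (as the split above shows). The comment shows what a perfect estimator would
# return; syllables.estimate may differ on some words.
import syllables

print(estimate_syllables("an old silent pond / a frog jumps into the pond / splash, silence again"))
# e.g. (5, 7, 5)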
def s_g_20Hindi(text):
    # Count sentences whose estimated syllable count exceeds 20
    count = 0
    sentences = sentence_tokenize.sentence_split(text, lang='hi')
    for i in sentences:
        if syllables.estimate(i) > 20:
            count += 1
    return count
def get_handcrafted_features(word):
    word = str(word)
    return [
        len(word),
        syllables.estimate(word),
        frequency.get(stemmer.stem(word.lower())) or 0,
        sum(c.isupper() for c in word) / len(word),  # fraction of uppercase characters
    ]
def flesch_reading_ease(text):
    words = tokenize.word_tokenize(text)
    n_word = len(words)
    n_sents = len(split_into_sentences(text))
    n_syl = sum(syllables.estimate(w) for w in words)
    # Guard against empty input to avoid division by zero
    fre = 206.835 - 1.015 * (0 if n_sents == 0 else n_word / n_sents) \
        - 84.6 * (0 if n_word == 0 else n_syl / n_word)
    return fre
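# A hedged runnable setup for flesch_reading_ease above: `split_into_sentences`
# is not defined in this snippet, so nltk's sentence tokenizer is used as a
# stand-in (requires nltk's 'punkt' data to be downloaded).
import syllables
from nltk import tokenize

def split_into_sentences(text):
    return tokenize.sent_tokenize(text)

print(flesch_reading_ease("The cat sat on the mat. It purred."))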
def save_other_features(data, parse_lst_path, config, path,
                        context=True, parse=True, multi=False):
    if multi:
        if 'complexity' in data:
            data_head, data_tail = multi_data(
                data[['id', 'corpus', 'sentence', 'token', 'complexity', 'class']])
        else:
            data_head, data_tail = multi_data(
                data[['id', 'corpus', 'sentence', 'token']])
        base, ext = os.path.splitext(path)
        path_head = base + '_head' + ext
        path_tail = base + '_tail' + ext
        omit = save_other_features(data_head, parse_lst_path, config, path_head,
                                   context=context, parse=parse)
        _ = save_other_features(data_tail, parse_lst_path, config, path_tail,
                                context=context, parse=parse)
        multi_compute(data, path, path_head, path_tail, omit)
        return

    # Features based on the target word
    data['word_len'] = data['token'].str.len().to_numpy()
    data['num_syllables'] = data['token'].apply(
        lambda x: syllables.estimate(str(x))).to_numpy()
    data['num_hyponyms'] = data.apply(
        lambda x: len(get_hyponyms(
            x['sentence'] if context else None, x['token'],
            disambiguate=config['disambiguate'] if context else False)),
        axis=1).to_numpy()
    data['num_hypernyms'] = data.apply(
        lambda x: len(get_hypernyms(
            x['sentence'] if context else None, x['token'],
            disambiguate=config['disambiguate'] if context else False)),
        axis=1).to_numpy()
    data['is_acronym'] = (data['token'].str.isupper() * 1).to_numpy()
    # Flags capitalized tokens (a rough proper-noun heuristic)
    data['is_pronoun'] = (data['token'].apply(lambda x: x[0].isupper()) * 1).to_numpy()

    # Features based on the sentence context
    omit = set()
    if context:
        corpus_dummies = pd.get_dummies(data['corpus'], prefix='corpus')
        for corpus_name in corpus_dummies:
            data[corpus_name] = corpus_dummies[corpus_name]
            omit.add(corpus_name)
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        tags = [tag for tag in tagdict.keys() if tag[0] not in punctuation]
        POS = data.apply(lambda x: get_POS(x['sentence'], x['token']), axis=1)
        for tag in tags:
            data['POS_' + tag] = (POS == tag) * 1
        funcs = ["textstat." + func[0] for func in
                 inspect.getmembers(textstat, predicate=inspect.ismethod)]
        for elem in tqdm(funcs):
            method = eval(elem)
            if method.__name__ in [
                    'difficult_words_list', 'set_lang', 'text_standard',
                    'dale_chall_readability_score_v2', 'dale_chall_readability_score',
                    'gunning_fog', 'spache_readability', 'avg_sentence_length',
                    'avg_sentence_per_word', 'sentence_count', 'difficult_words',
                    'is_difficult_word', 'is_easy_word', 'smog_index']:
                continue
            textstat.set_lang("en")
            data[method.__name__] = data['sentence'].apply(lambda x: method(x)).to_numpy()
            omit.add(method.__name__)
        data['SMOGIndex'] = data['sentence'].apply(
            lambda x: readability.getmeasures(x, lang='en')
            ['readability grades']['SMOGIndex']).to_numpy()
        data['DaleChallIndex'] = data['sentence'].apply(
            lambda x: readability.getmeasures(x, lang='en')
            ['readability grades']['DaleChallIndex']).to_numpy()
        omit.add('SMOGIndex')
        omit.add('DaleChallIndex')

    # Features based on the parse tree
    if parse and parse_lst_path is not None:
        parse_lst = pkl.load(open(parse_lst_path, 'rb'))
        parse_tree_depths = []
        token_depths = []
        num_words_at_depths = []
        for parse_tree, token in tqdm(zip(parse_lst, data['token'])):
            parse_tree_depths.append(parse_tree.height())
            token_depths.append(token_depth(parse_tree, token))
            num_words_at_depths.append(num_words_at_depth(parse_tree, token_depths[-1]))
        data['parse_tree_depth'] = np.array(parse_tree_depths).astype(np.int64)
        omit.add('parse_tree_depth')
        data['token_depth'] = np.array(token_depths).astype(np.int64)
        data['num_words_at_depth'] = np.array(num_words_at_depths).astype(np.int64)

    data.to_csv(path, sep='\t')
    return omit
def singleSyllableWordHindi(text):
    count = 0
    text = removePunctuationHindi(text)
    words = text.split()
    for i in words:
        if syllables.estimate(i) == 1:
            count += 1
    return count
def polysyllabicHindi(text):
    # Count words with two or more estimated syllables
    count = 0
    text = removePunctuationHindi(text)
    words = text.split()
    for i in words:
        if syllables.estimate(i) >= 2:
            count += 1
    return count
def analyzeScript(self, videoId):
    captions = self.getCaptions(videoId)
    # Caption info
    info = dict()
    info['videoId'] = videoId
    # Whole script text
    info['script'] = ' '.join([caption['text'] for caption in captions])
    info['avgSyllPerSec'] = syllables.estimate(info['script']) / sum(
        [caption['duration'] for caption in captions])
    for caption in captions:
        # Get tokens, words per second, and syllables per second for each caption
        caption['token'] = nltk.word_tokenize(caption['text'])
        caption['wordPerSec'] = len(caption['token']) / caption['duration']
        caption['syllPerSec'] = syllables.estimate(caption['text']) / caption['duration']
        # Key each caption by its start time, then drop the redundant field
        info[caption['start']] = caption
        del caption['start']
    return json.dumps(info, indent=4)
def word_dictionary_item(x):
    word = x.text
    tag = x.tag_
    return {
        'WORD': word,
        'POLAR': SentimentIntensityAnalyzer().polarity_scores(word)['compound'],
        'READ': syllables.estimate(word),
        'TAG': tag
    }
def no_syllables(word):
    """Get the number of syllables in word."""
    pronunciations = CMUDICT.get(word)
    if pronunciations is not None:
        # Count stress-marked (vowel) phones; take the pronunciation with the most
        sylls = [
            len(list(y for y in x if y[-1].isdigit())) for x in pronunciations
        ]
        return max(sylls)
    # Fall back to a heuristic estimate for out-of-vocabulary words
    return syllables.estimate(word)
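# A hedged setup sketch: CMUDICT is not defined in this snippet. One plausible
# construction uses nltk's cmudict corpus (requires nltk.download('cmudict')).
import syllables
from nltk.corpus import cmudict

CMUDICT = cmudict.dict()
print(no_syllables("water"))   # found in the CMU dictionary
print(no_syllables("qwzrtf"))  # falls back to syllables.estimate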