def generate_n_syllables(self, obs_map, obs_map_r, n_syllables, states):
        words = []
        emissions = []

        if not states:
            curr_state = np.random.choice(list(range(self.L)), p=self.A_start)
        else:
            curr_state = states[-1]

        while (syllables.estimate(' '.join(words)) < n_syllables):
            curr_state = np.random.choice(list(range(self.L)),
                                          p=self.A[curr_state])
            states.append(curr_state)
            emission_i = np.random.choice(list(range(self.D)),
                                          p=self.O[curr_state])
            emissions.append(emission_i)
            words.append(obs_map_r[emission_i])

            syllables_estimate = syllables.estimate(' '.join(words))
            if syllables_estimate > n_syllables:
                # Undo the last choice: it pushed the line past the target count
                words = words[:-1]
                emissions = emissions[:-1]
                states = states[:-1]

        return ' '.join(words), states
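A minimal standalone sketch of the same backtracking idea, using only the syllables package and a small hypothetical vocabulary (no HMM): keep sampling words until the estimated syllable budget is met, and discard any pick that would overshoot it.

import random
import syllables

# Hypothetical vocabulary, for illustration only.
VOCAB = ["river", "quiet", "golden", "sun", "whisper", "evening", "light"]

def sample_words_to_syllables(n_syllables, vocab=VOCAB, max_tries=1000):
    """Randomly pick words until the estimated syllable total reaches n_syllables."""
    words, total = [], 0
    for _ in range(max_tries):  # guard against an unreachable budget
        if total >= n_syllables:
            break
        word = random.choice(vocab)
        count = syllables.estimate(word)
        if total + count > n_syllables:
            continue  # this pick would overshoot, so skip it (the "undo" step above)
        words.append(word)
        total += count
    return ' '.join(words)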
Example #2
def main():

    print(syllables.estimate('estimate'))

    print(syllables.estimate('emudeceram'))

    print(len('emudeceram'))
Example #3
def get_features_spacy(tok):
    # Entity type defaults to "UNKNOWN" when spaCy does not tag the token.
    ent_type = tok.ent_type_ if tok.ent_type_ != '' else "UNKNOWN"
    return [
        tok.text, tok.pos_, tok.tag_,
        syllables.estimate(tok.text), tok.is_stop, ent_type, tok.dep_
    ]
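A possible call site for the feature extractor above, assuming the en_core_web_sm spaCy model is installed; the sentence is illustrative only.

import spacy
import syllables

nlp = spacy.load("en_core_web_sm")
doc = nlp("Apple is looking at buying a U.K. startup.")
# One row per token: text, POS, fine-grained tag, estimated syllables,
# stop-word flag, entity type (or "UNKNOWN"), and dependency relation.
for tok in doc:
    print(get_features_spacy(tok))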
Example #4
def readability():
    m = input("Would you like to analyze FKGL or SMOG?")
    x = []
    if (m.lower() == 'fkgl'):
        for dataset in datasets:
            tweets = []
            for tweet in dataset:
                words = 0
                sentences = 0
                syl = 0
                blob = textblob.TextBlob(tweet)
                for sentence in blob.sentences:
                    sentences += 1
                    for word in sentence.words:
                        words += 1
                        syl += syllables.estimate(word)
                tweets.append((0.39 * (words / sentences) + 11.8 *
                               (syl / words)) - 15.59)
            x.append(sum(tweets) / len(tweets))
        plt.bar(y, x)
        plt.xticks(rotation=45, ha="right")
        plt.xlabel('Twitter Handle')
        plt.ylabel('Flesch-Kincaid Grade Level')
        plt.title('Flesch-Kincaid Grade Level By Twitter Handle')
        plt.show()
        prntmethod(x)
    elif (m.lower() == 'smog'):
        for dataset in datasets:
            tweets = []
            for tweet in dataset:
                sent = 0
                polysyl = 0
                blob = textblob.TextBlob(tweet)
                for sentence in blob.sentences:
                    sent += 1
                    for word in sentence.words:
                        if (syllables.estimate(word) >= 3):
                            polysyl += 1
                tweets.append((1.043 * math.sqrt(polysyl * 30 / sent)) +
                              3.1291)
            x.append(sum(tweets) / len(tweets))
        plt.bar(y, x)
        plt.xticks(rotation=45, ha="right")
        plt.xlabel('Twitter Handle')
        plt.ylabel('SMOG Level')
        plt.title('SMOG Level By Twitter Handle')
        plt.show()
        prntmethod(x)
    else:
        print(
            "Sorry, that type of analysis is not supported. Please try again.")
Example #5
def search(dataset):
    tweets_pol = []
    tweets_sub = []
    tweets_fkgl = []
    tweets_smog = []
    tweets_form = []
    for tweet in dataset:
        ws = 0
        sent = 0
        syl = 0
        f = 0
        c = 0
        polysyl = 0
        blob = textblob.TextBlob(tweet)
        tweets_pol.append(blob.polarity)
        tweets_sub.append(blob.subjectivity)
        for sentence in blob.sentences:
            sent += 1
            for word in sentence.words:
                syl += syllables.estimate(word)
                ws += 1
                if (syllables.estimate(word) >= 3):
                    polysyl += 1
        for word, tag in blob.tags:
            if (tag in ("NN", "JJ", "IN", "DT")):
                f += 1
            elif (tag in ("PR", "VB", "RB", "UH")):
                c += 1
        tweets_smog.append((1.043 * math.sqrt(polysyl * 30 / sent)) + 3.1291)
        tweets_fkgl.append((0.39 * (ws / sent) + 11.8 * (syl / ws)) - 15.59)
        if (f + c == 0 or f - c == 0):
            tweets_form.append(50)
        else:
            tweets_form.append(50 * (((f - c) / (f + c)) + 1))
    x = [
        sum(tweets_pol) / len(tweets_pol),
        sum(tweets_sub) / len(tweets_sub),
        sum(tweets_smog) / len(tweets_smog),
        sum(tweets_fkgl) / len(tweets_fkgl),
        sum(tweets_form) / len(tweets_form)
    ]
    print()
    print("Average polarity:", x[0])
    print("Average subjectivity:", x[1])
    print("Average Flesch-Kincaid Grade Level:", x[3])
    print("Average SMOG index:", x[2])
    print("Average Formality index:", x[4])
    print()
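The formality value appended in search() follows the form 50 * ((f - c) / (f + c) + 1), where f counts formal tags (NN, JJ, IN, DT) and c counts contextual tags (PR, VB, RB, UH). A small worked instance with illustrative tag counts:

f, c = 6, 2
formality = 50 * (((f - c) / (f + c)) + 1)
print(formality)  # 75.0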
def syllableCountHindi(text):
    count = 0
    text = removePunctuationHindi(text)
    words = text.split()
    for i in words:
        count += syllables.estimate(i)
    return count
def w_g_4Hindi(text):
    count = 0
    words = text.split()
    for x in words:
        if (syllables.estimate(x) > 4):
            count += 1
    return count
Example #8
 def score(self, text: str, args: list) -> float:
     # Target readability score for each audience level (Flesch Reading Ease scale).
     # args contains a single element: the audience.
     audience = args[0]
     desired_scores = {
         "basic": 70.0,
         "intermediate": 55.0,
         "difficult": 40.0
     }
     """the FKGL score is computed using the following equation: 
     206.835-1.015(total words/total sentences)-84.6(total syllables/total 
     words) """
     total_sentences = len(nltk.sent_tokenize(text))
     words = nltk.word_tokenize(text)
     total_words = len(words)
     total_syllables = 0
     for word in words:
         total_syllables += syllables.estimate(word)
     fkgl_score = 206.835 - 1.015 * (
         total_words / total_sentences) - 84.6 * (total_syllables /
                                                  total_words)
     # Calculate how many multiples of 5 the score is away from the desired value
     deviation = (abs(fkgl_score - desired_scores[audience])) / 5
     to_subtract = deviation * 0.2
     if to_subtract > 1:
         grade = 0
     else:
         grade = 1 - to_subtract
     return grade
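A small worked instance of the deviation-to-grade mapping above, with illustrative values: a computed score of 62.0 against the "intermediate" target of 55.0.

score_value, desired = 62.0, 55.0
deviation = abs(score_value - desired) / 5   # 1.4 "multiples of 5" away from the target
grade = max(0.0, 1 - deviation * 0.2)        # clamp at 0, as in the method above
print(round(grade, 2))  # 0.72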
def test_estimate():
    EXPECTED_ACCURACY = .75
    hits = []
    misses = []

    d = cmudict.dict()
    for word in d:
        phones = d[word][0]
        cmudict_syllables = 0
        for phone in phones:
            if re.match(r"\w*[012]$", phone):
                cmudict_syllables += 1
        estimated_syllables = syllables.estimate(word)
        if cmudict_syllables == estimated_syllables:
            hits.append(word)
        else:
            misses.append(word)

    hit = len(hits)
    miss = len(misses)
    total = hit + miss
    ACCURACY = hit / total
    if (ACCURACY < EXPECTED_ACCURACY):
        raise AssertionError(
            'syllables.estimate(): Expected accuracy of {0}, got {1}.'.format(
                EXPECTED_ACCURACY, ACCURACY))
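A minimal way to run the accuracy check directly, assuming the cmudict, syllables, and re modules are available (a test runner such as pytest would also collect test_estimate automatically).

import re
import cmudict
import syllables

if __name__ == "__main__":
    test_estimate()
    print("syllables.estimate() met the expected accuracy threshold.")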
Example #10
def Coherence_M_5(docs_A):
    #Measure Flesch-Kincaid Grade Level
    #after removing stop words
    import syllables
    Factor_1 = 0.39
    Factor_2 = 11.8
    Factor_3 = 15.59
    Coh_M = np.array([])
    MyDoc = docs_A
    Sim_Sent_Doc = []
    for k in range(len(MyDoc)):
        doc = []
        doc = str(MyDoc[k])
        Sent_doc = sent_tokenize(doc)
        T_Sent = len(Sent_doc)  # Total sentences
        T_Word = np.array([])
        T_Syll = np.array([])
        tokenized_word = []
        word_tokens = []
        tokenized_word = word_tokenize(doc)
        word_tokens = [w for w in tokenized_word if w.isalpha()]
        Syllables_Word = [syllables.estimate(w) for w in word_tokens]
        T_Syll = np.append(T_Syll, sum(Syllables_Word))  #Total Syllables
        T_Word = np.append(T_Word, len(word_tokens))  #Total words
        M = Factor_1 * (sum(T_Word) / T_Sent) + Factor_2 * (
            sum(T_Syll) / sum(T_Word)) - Factor_3
        Coh_M = np.append(Coh_M, M)
    return [Coh_M]
Example #11
def Flesch_Kincaid(doc: str) -> float:
    """Return the Flesch-Kincaid grade level of doc."""
    words = len(doc.split())
    # Count sentences as the number of '.', '?', or '!' characters that are not
    # immediately preceded by another one of those characters (and are not the
    # first character of the document).
    sentences = 0
    for i in range(len(doc)):
        if (doc[i] == '.' or doc[i] == '?' or doc[i] == '!') and (i != 0) and (
                doc[i - 1] != '.' and doc[i - 1] != '?' and doc[i - 1] != '!'):
            sentences += 1
    sentences = max(
        sentences, 1
    )  #every document must contain at least 1 sentence (to avoid division by 0)
    syllables = 0
    #d = cmudict.dict()
    for w in doc.split():
        '''
        #This would theoretically be more accurate, but it's too slow
        syl_list = d[w.lower()] #list of syllables in word w
        if syl_list != []:
            syllables += [len(list(y for y in x if y[-1].isdigit())) for x in syl_list][0]
        else: #switch to syl package (less accurate) if not in cmudict
            syllables += syl.estimate(w)
        '''
        syllables += syl.estimate(w)
    #taking max(words,1) so that don't divide by zero...
    return (0.39 * words / sentences) + (11.8 * syllables /
                                         max(words, 1)) - 15.59
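A possible call site for the function above, assuming syl is an alias for the syllables package; the sample text is illustrative and the exact result depends on the syllable heuristic.

import syllables as syl

sample = "The cat sat on the mat. It purred quietly in the warm afternoon sun."
print(Flesch_Kincaid(sample))  # grade-level estimate for the sample text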
Example #12
def get_rhyme_word(mc, a_rhyme, b_rhyme, syllable):
    """
    Gets a word with the given syllable count that rhymes with a_rhyme or b_rhyme.
    Calls rhymes_all_words().

    :param mc: dict
    :param a_rhyme: string
    :param b_rhyme: string
    :param syllable: int
    :return: chosen_word: string

    Examples
    --------
    >>> get_rhyme_word(markov_chain, 'the', None, 2)
    lightly
    >>> get_rhyme_word(markov_chain, None, 'people', 3)
    Seasonal

    """
    # Run rhymes_all_words(), which returns rhymes of a_rhyme or b_rhyme
    # filter out all rhymes that do not have syllable = syllable
    # Calculate the mc of the filtered rhymes and their probabilities
    # Randomly pick a rhyme based on its probability

    rhyme = a_rhyme if b_rhyme is None else b_rhyme

    chosen_word = "@"
    rhymes_prob = rhymes_all_words(rhyme, mc)
    rhymes_prob = [r for r in rhymes_prob if syllables_p.estimate(r) == syllable]
    try:
        chosen_word = random.choice(rhymes_prob)
    except IndexError:
        # No rhyme with the required syllable count was found; keep the placeholder.
        pass
    return chosen_word
Example #13
 def haiku_check(self, sentence):
     """ Check message and return formatted haiku """
     body_array = sentence.replace('  ', ' ').split(' ')
     line_1 = ''
     line_1_syllables = 0
     line_2 = ''
     line_2_syllables = 0
     line_3 = ''
     line_3_syllables = 0
     syllable_count = 0
     for word in body_array:
         word_syllables = syllables.estimate(word)
         syllable_count = syllable_count + word_syllables
         if line_1_syllables + word_syllables <= 5:
             line_1_syllables = line_1_syllables + word_syllables
             line_1 = line_1 + ' ' + word
         elif line_2_syllables + word_syllables <= 7 and line_1_syllables == 5:
             line_2_syllables = line_2_syllables + word_syllables
             line_2 = line_2 + ' ' + word
         elif line_3_syllables + word_syllables <= 5 and line_1_syllables == 5 and line_2_syllables == 7:
             line_3_syllables = line_3_syllables + word_syllables
             line_3 = line_3 + ' ' + word
     return {
         'syllable_count': syllable_count,
         'line_1_syllables': line_1_syllables,
         'line_1': line_1,
         'line_2_syllables': line_2_syllables,
         'line_2': line_2,
         'line_3_syllables': line_3_syllables,
         'line_3': line_3
     }
Example #14
def rhyming_word_generator(rhyming_scheme):
    for alphabet in rhyming_scheme:
        #print("current alphabet of scheme: {}".format(alphabet))
        flag = 0
        if rhyme_pattern:
            for key in rhyme_pattern.keys():
                #print("key check: {}".format(key))
                if key == alphabet:
                    flag = 1
                    words = rhyme_pattern[alphabet]
                    print("To match with line type '{}' with {} syllables in total: {}"\
                            .format(key, sentence_syllables[alphabet], words))

        if flag == 0:
            words = []
            sentence = input("Enter your line: ")
            #print("sentence: {}".format(sentence))
            sentence_words = sentence.split(" ")
            #print("sentence words: {}".format(sentence_words))
            sentence_syllables[alphabet] = syl.estimate(sentence)
            print(sentence_syllables[alphabet])
            last_word = sentence_words.pop()
            #print("last words: {}".format(last_word))
            l = pronouncing.rhymes(last_word)
            #print(l)
            for i in range(4):
                words.append(random.choice(l))
            rhyme_pattern[alphabet] = words
Example #15
def get_n_syllables(word):
    word = inverse_replace_punctuation(word)
    word = word.lower()
    if word in cmudict_dict.keys():
        return sum(map(lambda x: x[-1].isdigit(), cmudict_dict[word][0]))
    else:
        return syllables.estimate(word)
def create_adjective_data_rows(adjectives_list):
    frequencies = get_frequencies(adjectives_list)
    probabilities = get_probabilities(adjectives_list)
    syllable_counts = [
        syllables.estimate(adjective) for adjective in adjectives_list
    ]
    return zip(adjectives_list, frequencies, probabilities, syllable_counts)
Example #17
def Coherence_M_4(docs_A):
    #Measure Readability
    #after removing stop words
    import syllables
    Factor_1 = 206.835
    Factor_2 = 1.015
    Factor_3 = 84.6
    Coh_M = np.array([])
    MyDoc = docs_A
    Sim_Sent_Doc = []
    for k in range(len(MyDoc)):
        doc = []
        doc = str(MyDoc[k])
        doc = doc.lower()
        Sent_doc = sent_tokenize(doc)
        T_Sent = len(Sent_doc)  # Total sentences
        T_Word = np.array([])
        T_Syll = np.array([])
        tokenized_word = []
        word_tokens = []
        tokenized_word = word_tokenize(doc)
        word_tokens = [w for w in tokenized_word if w.isalpha()]
        Syllables_Word = [syllables.estimate(w) for w in word_tokens]
        T_Syll = np.append(T_Syll, sum(Syllables_Word))  #Total Syllables
        T_Word = np.append(T_Word, len(word_tokens))  #Total words
        M = Factor_1 - Factor_2 * (sum(T_Word) / T_Sent) - Factor_3 * (
            sum(T_Syll) / sum(T_Word))
        Coh_M = np.append(Coh_M, M)
    return [Coh_M]
Example #18
def Validate(lineList):
    valid = []
    for sentence in lineList:
        sylls = SY.estimate(sentence)
        if (sylls == 10):
            valid.append(sentence)

    return valid
Example #19
def get_average_syllables(bars):
    """Calculate the average number of syllables per bar."""
    bar_syllables = []
    for bar in bars:
        bar_syllables.append(syllables.estimate(bar))
    return np.mean(np.asarray(bar_syllables))
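A quick usage sketch for the helper above, assuming numpy is imported as np; the bar texts are illustrative.

import numpy as np
import syllables

bars = [
    "counting syllables line by line",
    "the estimate is only a heuristic",
]
print(get_average_syllables(bars))  # mean of the per-bar syllable estimates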
Example #20
def add_effects(subtitles):
    effected_subs = SSAFile()
    for sub in subtitles:
        content = sub.plaintext.strip().replace('\n', ' ')
        time_per_syllable = (sub.end - sub.start) / syllables.estimate(content)
        current_time = sub.start
        current_index = 0
        for word in content.split(' '):
            sylls = syllables.estimate(word)
            sub_end_time = current_time + time_per_syllable * sylls
            current_index += len(word) if current_index == 0 else len(word) + 1
            text = content[:current_index] + '{\\alpha&HFF}' + content[
                current_index:]  # adds transparency
            effected_subs.append(
                SSAEvent(start=current_time, end=sub_end_time, text=text))
            current_time = sub_end_time
    return effected_subs
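A possible way to drive add_effects, assuming the subtitle objects come from pysubs2 (which provides SSAFile and SSAEvent); the file names are placeholders.

import pysubs2
import syllables
from pysubs2 import SSAFile, SSAEvent

subs = pysubs2.load("input.srt")   # any subtitle format pysubs2 can read
styled = add_effects(subs)         # one progressively revealed event per word
styled.save("output.ass")          # ASS output preserves the alpha override tags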
Example #21
def estimate_syllables(haiku: str):
    """Estimate the number of syllables per each line of the given haiku."""
    lines = haiku.split("/")
    counts = []
    for line in lines:
        words = line.strip(" \t\n#").split()
        counts.append(sum(syllables.estimate(w) for w in words))
    return tuple(counts)
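For example, on a slash-delimited haiku the function returns one estimated count per line (the exact numbers depend on the syllable heuristic).

import syllables

haiku = "an old silent pond / a frog jumps into the pond / splash silence again"
print(estimate_syllables(haiku))  # a 3-tuple of per-line syllable estimates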
def s_g_20Hindi(text):
    count = 0
    num_syl = 0
    sentences = sentence_tokenize.sentence_split(text, lang='hi')
    for i in sentences:
        if (syllables.estimate(i) > 20):
            count += 1
    return count
def get_handcrafted_features(word):
    word = str(word)
    return [
        len(word),
        syllables.estimate(word),
        frequency.get(stemmer.stem(word.lower())) or 0,
        len([_ for _ in word if _.isupper()]) / len(word)
    ]
def flesch_reading_ease(text):
    words = tokenize.word_tokenize(text)
    n_word = len(words)
    n_sents = len(split_into_sentences(text))
    n_syl = sum(syllables.estimate(w) for w in words)
    fre = (206.835 - 1.015 * (0 if n_sents == 0 else (n_word / n_sents))
           - 84.6 * (0 if n_word == 0 else (n_syl / n_word)))
    return fre
Example #25
def save_other_features(data, parse_lst_path, config, path, context=True, parse=True, multi=False):
    if multi:
        if 'complexity' in data:
            data_head, data_tail = multi_data(data[['id','corpus','sentence','token','complexity','class']])
        else:
            data_head, data_tail = multi_data(data[['id','corpus','sentence','token']])
        base, ext = os.path.splitext(path)
        path_head = base+'_head'+ext
        path_tail = base+'_tail'+ext
        omit = save_other_features(data_head, parse_lst_path, config, path_head, context=context, parse=parse)
        _    = save_other_features(data_tail, parse_lst_path, config, path_tail, context=context, parse=parse)
        multi_compute(data, path, path_head, path_tail, omit)
        return
    # based on aspect word
    data['word_len'] = data['token'].str.len().to_numpy()
    data['num_syllables'] = data['token'].apply(lambda x: syllables.estimate(str(x))).to_numpy()
    data['num_hyponyms'] = data.apply(lambda x: len(get_hyponyms(x['sentence'] if context else None, x['token'], disambiguate=config['disambiguate'] if context else False)), axis=1).to_numpy()
    data['num_hypernyms'] = data.apply(lambda x: len(get_hypernyms(x['sentence'] if context else None, x['token'], disambiguate=config['disambiguate'] if context else False)), axis=1).to_numpy()
    data['is_acronym'] = (data['token'].str.isupper()*1).to_numpy()
    data['is_pronoun'] = (data['token'].apply(lambda x: x[0].isupper())*1).to_numpy()
    # based on context
    omit = set()
    if context:
        corpus_dummies = pd.get_dummies(data['corpus'], prefix='corpus')
        for corpus_name in corpus_dummies:
            data[corpus_name] = corpus_dummies[corpus_name]
            omit.add(corpus_name)
        tagdict = load('help/tagsets/upenn_tagset.pickle')
        tags = [tag for tag in tagdict.keys() if tag[0] not in punctuation]
        POS = data.apply(lambda x: get_POS(x['sentence'], x['token']), axis=1)
        for tag in tags:
            data['POS_'+tag] = (POS == tag) * 1
        funcs = ["textstat." + func[0] for func in inspect.getmembers(textstat, predicate=inspect.ismethod)]
        for elem in tqdm(funcs):
            method = eval(elem)
            if method.__name__ in ['difficult_words_list', 'set_lang', 'text_standard', 'dale_chall_readability_score_v2', 'dale_chall_readability_score', 'gunning_fog', 'spache_readability', 'avg_sentence_length', 'avg_sentence_per_word', 'sentence_count', 'difficult_words', 'is_difficult_word', 'is_easy_word', 'smog_index']:
                continue
            textstat.set_lang("en")
            data[method.__name__] = data['sentence'].apply(lambda x: method(x)).to_numpy()
            omit.add(method.__name__)
        data['SMOGIndex'] = data['sentence'].apply(lambda x: readability.getmeasures(x, lang='en')['readability grades']['SMOGIndex']).to_numpy()
        data['DaleChallIndex'] = data['sentence'].apply(lambda x: readability.getmeasures(x, lang='en')['readability grades']['DaleChallIndex']).to_numpy()
        omit.add('SMOGIndex'); omit.add('DaleChallIndex')
        if parse and parse_lst_path is not None:
            parse_lst = pkl.load(open(parse_lst_path, 'rb'))
            parse_tree_depths = []
            token_depths = []
            num_words_at_depths = []
            for parse_tree, token in tqdm(zip(parse_lst, data['token'])):
                parse_tree_depths.append(parse_tree.height())
                token_depths.append(token_depth(parse_tree, token))
                num_words_at_depths.append(num_words_at_depth(parse_tree, token_depths[-1]))
            data['parse_tree_depth'] = np.array(parse_tree_depths).astype(np.int64)
            omit.add('parse_tree_depth')
            data['token_depth'] = np.array(token_depths).astype(np.int64)
            data['num_words_at_depth'] = np.array(num_words_at_depths).astype(np.int64)
    data.to_csv(path, sep='\t')
    return omit
def singleSyllableWordHindi(text):
    count = 0
    num_syl = 0
    text = removePunctuationHindi(text)
    words = text.split()
    for i in words:
        num_syl = syllables.estimate(i)
        if (num_syl == 1):
            count += 1
    return count
def polysyllabicHindi(text):
    count = 0
    num_syl = 0
    text = removePunctuationHindi(text)
    words = text.split()
    for i in words:
        num_syl = syllables.estimate(i)
        if (num_syl >= 2):
            count += 1
    return count
Example #28
    def analyzeScript(self, videoId):
        captions = self.getCaptions(videoId)
        # Caption info
        info = dict()
        info['videoId'] = videoId
        info['script'] = ' '.join([caption['text'] for caption in captions
                                   ])  # whole script text
        info['avgSyllPerSec'] = syllables.estimate(info['script']) / sum(
            [caption['duration'] for caption in captions])

        for caption in captions:  # Get tokens, words per second, syllables per second for each captions
            caption['token'] = nltk.word_tokenize(caption['text'])
            caption['wordPerSec'] = len(caption['token']) / caption['duration']
            caption['syllPerSec'] = syllables.estimate(
                caption['text']) / caption['duration']
            info[caption['start']] = caption
            del caption['start']

        return json.dumps(info, indent=4)
def word_dictionary_item(x):
    word = x.text
    tag = x.tag_
    return {
        'WORD': word,
        'POLAR':
        SentimentIntensityAnalyzer().polarity_scores(word)['compound'],
        'READ': syllables.estimate(word),
        'TAG': tag
    }
Example #30
def no_syllables(word):
    """Get the number of syllables in word."""
    pronunciations = CMUDICT.get(word)
    if pronunciations is not None:
        sylls = [
            len(list(y for y in x if y[-1].isdigit())) for x in pronunciations
        ]
        return max(sylls)

    return syllables.estimate(word)
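A possible setup and usage sketch for the fallback above, assuming CMUDICT is built from the cmudict package.

import cmudict
import syllables

CMUDICT = cmudict.dict()
print(no_syllables("created"))     # uses the CMUdict pronunciation when the word is known
print(no_syllables("emudeceram"))  # falls back to syllables.estimate for unknown words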