Example 1
  def __init__(self, dataset="picasso2", basedir="parsed_data"):
    self.dataset = dataset
    self.basedir = basedir
    filename = "%s/%s/source/%s" % (basedir,dataset,dataset)
    self.debug("poemparser:init:dataset parsing '%s'..." % filename)

    with open("pyParser/english_words.txt") as word_file:
      self.english_words = set(word.strip().lower() for word in word_file)

    # Open and analyze the text data.
    self.unknownWords   = {}
    self.iffyWords      = {}
    self.allmatch       = {}
    self.alltokens      = self.openTokens(filename)
    self.parsedTokens   = [token for token in self.alltokens[0] if token != '-']
    self.replacedTokens = [token for token in self.alltokens[1] if token != '-']
    self.fullTokens     = [token for token in self.alltokens[2] if token != '-']
    self.tokens         = self.parsedTokens
    self.loweredTokens  = [token.lower() for token in self.replacedTokens]
    self.pos_tags       = nltk.pos_tag(self.replacedTokens)
    self.text           = nltk.Text(self.tokens)
    self.dict           = cmudict.dict() 
    self.lastspeed      = 0
    self.midiindex      = 0
    
    self.setMIDISettings(12)
    
    self.debug("poemparser:init:words %s"  % self.fullTokens)
    self.debug("poemparser:init:tokens %s" % self.tokens)
    self.debug("poemparser:init:text %s"   % self.text)
Example 2
def reset_country_codes_to_emoflags(cc_path='country_codes.txt',
        irange=ET.FLAGS_RANGE, charset='utf-8'):
    '''
    Using a country code dict, set the name and syllable fields
    in a copy of emo_tuples.
    '''
    cmu_prons = cmudict.dict() # get the CMU Pronouncing Dict
    cc_dict = load_country_codes(cc_path)

    for tup in ET.EMO_TUPLES[irange.start:irange.stop]:
        cc2 = tup[ET.INDEX_ALTERNATIVES][0].strip(':').upper()
        # print(cc2, '  ', end='')
        monos, polys, names = [], [], [cc2]
        names.extend(nm for nm in tup[ET.INDEX_POLYSYLLABLES] if len(nm) > 2)
        try:
            names.extend(cc_dict[cc2])
            # print(names, file=sys.stderr)
        except KeyError:
            print("{} missing {}\n\tusing: {}".format(
                   cc2, tup, names), file=sys.stderr)
        for name in set(names):
            if sylc.syl_count(cmu_prons, name) == 1:
                monos.append(name)
            else:
                polys.append(name)
        tupal = list(tup)
        tupal[ET.INDEX_WORDSYLLABLES] = monos
        tupal[ET.INDEX_POLYSYLLABLES] = polys
        ret = tuple(tupal)
        print("    {},".format(ret), file=sys.stdout)
        # tupal[ET.INDEX_WORDSYLLABLES] =
    print()
Example 3
	def stress(self,bysentence=False):
		"""
		tokenizes (I guess) the words in self.text by the stress pattern in each of the words.
		"""
		vowels = ['A','E','I','O','U']
		possible_stresses = ['1','2','0']
		totaldic = cmudict.dict()
		def gen_stress(stripped_text):
			stress_list = []
			for word in stripped_text.lower().split():
				try:
					stress = str()
					phonemized = totaldic[word][0]
					for phoneme in phonemized:
						for stresser in possible_stresses:
							if stresser in phoneme:
								stress += stresser
					for index, sound in enumerate(phonemized[len(phonemized)-2:len(phonemized)]):
						for vowel in vowels:
							if vowel in sound:
								stress_list.append([word,stress,[index, sound],phonemized,len(phonemized)])
				except KeyError:
					# print("{} couldn't be found".format(word))
					pass
			return stress_list

		if bysentence:
			sentences = PunktSentenceTokenizer().tokenize(self.text)
			stress_by_sentence = [sentence.translate(string.maketrans("",""), string.punctuation) for sentence in sentences]
			return [gen_stress(sentence) for sentence in stress_by_sentence]

		elif not bysentence:
			stress_total = self.text.translate(string.maketrans("",""), string.punctuation) 
			return gen_stress(stress_total)
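
A minimal standalone sketch of the same idea (not from the repository above; it assumes only that NLTK's cmudict corpus is installed): pull the stress digits out of a word's first CMU pronunciation.

from nltk.corpus import cmudict

prons = cmudict.dict()
word = "example"
stress = "".join(ch for ph in prons[word][0] for ch in ph if ch in "012")
print(word, stress)  # -> example 010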
Example 4
 def __compliant_haiku(self, haiku_source):
     """Ensure that newlines remain and all 
     other punctuation has been stripped"""
     """Ensure that newlines remain and all 
     other punctuation has been stripped"""
     dict = cmudict.dict()
     haiku_lines = haiku_source.splitlines()
     syllables = []
     for line in haiku_lines:
         if line == "":
             continue
         sal=[]
         for word in line.split(" "):
             sal.append(len([x for x in dict[word][0] if x[-1].isdigit()]))
         syllables.append(sum(sal))
     pattern = [5,7,5]
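     # Compare the syllable counts, three lines at a time, against the 5-7-5 haiku pattern.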
     if len(syllables) % 3 == 0:
         while len(syllables) > 0:
             if syllables[:3] == pattern:
                 for x in range(2,-1,-1):
                     syllables.pop(x)
             else:
                 return False
     else:
         return False
     return True
Example 5
 def __init__(self,text):
     # Initialize vars
     self.sent_count = 0
     self.word_count = 0
     self.syll_count = 0
     self.cmu = cmudict.dict()
     self.processText(text)
Example 6
def approx_nsyl(word):
	"""Credit - Jason Sundram, http://runningwithdata.com/post/3576752158/w
	Return the max syllable count in the case of multiple pronunciations"""
	d = cmudict.dict()
	if word.lower() not in d:
		return 0
	return max([len([y for y in x if y[-1].isdigit()]) for x in d[word.lower()]])
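# Added usage note: 'water' (W AO1 T ER0) has two vowel phonemes, so
# approx_nsyl('water') returns 2; words missing from the dictionary return 0.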
Example 7
def group_rhyming_tweets(filtered_tweet_list):
    """groups rhyming tweets into lists, then returns a list containing those lists. lists are sorted so that the list with the most rhyming words
    is first in the list."""
    copy_filtered_tweet_list = list(filtered_tweet_list)
    dictionary = cmudict.dict()
    grouped_rhyming_tweets = []
    index = 0
    while (
        index < len(copy_filtered_tweet_list) - 1
    ):  # don't need to check last element for rhymes against other words b/c all pairs of words checked already by that point
        rhyme_list = [copy_filtered_tweet_list[index]]
        i = index + 1
        while i < len(copy_filtered_tweet_list):
            if (
                do_sentences_rhyme(copy_filtered_tweet_list[index], copy_filtered_tweet_list[i], dictionary)
                or sentence_rhyme_score(copy_filtered_tweet_list[index], copy_filtered_tweet_list[i]) > 4
            ):
                rhyme_list.append(copy_filtered_tweet_list[i])
                copy_filtered_tweet_list.pop(i)
                i = i - 1
            i = i + 1
        rhyme_list = list(set(rhyme_list))  # remove non-unique entries by converting to a set and back again
        grouped_rhyming_tweets.append(rhyme_list)
        index = index + 1
    # grouped_rhyming_tweets = sorted(grouped_rhyming_tweets, key = len, reverse = True)
    grouped_rhyming_tweets = [i for i in grouped_rhyming_tweets if len(i) > 1]
    return grouped_rhyming_tweets
Example 8
	def compile_meter_list(self, new_words, verbose=True):
	    # simplifies and compiles CMU corpus info into a nested list
	    iambic = cmudict.dict()                     # connect to cmu corpus, called iambic
	    big_list = []                               # list to collect all the different versions of words and their meter
	    for word in new_words:                      # get word from list of clean words
	        syl_num = sylco([word])
	        word_n_versions_list = []               # list has each word and the different versions
	        word_n_versions_list.append(word)       # add word
	        versions_list = []                      # list of all diff versions
	        try:                                    # if word is in corpus
	            for n,x in enumerate(iambic[word.lower()]): # get versions for each word
	                version = []                    # list for each version
	                version.append(word+str(n))     # add word+version
	                meter_list = []                 # list holds word version's meter
	                for y in x:                     # for word in cmu-dict sent
	                    for char in y:              # for character in word
	                        if char.isdigit() == True: # if the char is a number
	                            meter_list.append(int(char)) # add number to meter
	                version.append(meter_list)      # add meter to the word version
	                versions_list.append(version)   # add all the versions to one list
	            word_n_versions_list.append(versions_list) # add list of diff versions to word and versions list
	            big_list.append(word_n_versions_list)       
	        except KeyError:                        # if word isn't in corpus
	            version = []                        # empty version
	            version.append(word+str(0))         # add word1
	            meter_list = []                     # empty meter list
	            if len(syl_num) == 1:
	                for syl in range(syl_num[0]):          # for each syllable...
	                    meter_list.append(-1)           # add -1 (unknown stress) to meter_list
	                version.append(meter_list)          # add the meter list to the version
	                versions_list.append(version)       # add version w/ word1 to versions list
	                word_n_versions_list.append(versions_list) # add list of diff versions to word and versions list
	                big_list.append(word_n_versions_list) # adds word and versions to big list
	    return big_list
Example 9
    def on_load(self):
        print "Loading: " + self.__class__.__name__
        wd = self.context.getWorkingDir()
        nltk.data.path.append(wd + "nltk_data")

        self.d = cmudict.dict()
Example 10
def make_cmu_wordlist():
    """
    Strip the CMU Pronunciation Dictionary of accent marks.

    Prefix each word with '#' and append '$' (start/end markers for Markov chain use).

    Pickle and dump to 'cmu.p'.
    """
    d = cmudict.dict()
    pronunciation_list = d.values()

    edited_list = []
    for entry in pronunciation_list:
        for word in entry:
            edited_word = ["#"]
            for i in xrange(len(word)):
                #remove accent marks
                edited_word.append(word[i].rstrip('0123456789'))
                
            #Use '$' to mark the end of words
            edited_word.append('$')
            edited_list.append(edited_word)

#    with open('wordlists/cmu.p', 'w') as outfile:
#        pickle.dump(edited_list, outfile)
    
    return edited_list
Example 11
  def __init__(self):

    # generate n2w 
    self.n2w = gen_n2w()

    # syllable dict
    self.cmu = cmudict.dict()
Example 12
def parse_sentence(sent, syl=partial(syllabify, English),
                   pron_dict=cmudict.dict()):
    sent = sent.strip()
    if not len(sent):
        return
    tokens = list(filter(len, map(preprocess, sent.split())))
    phonemes = (map(syl, pron_dict[t]) for t in tokens)

    nsyllables = set()
    final_sounds = set()
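    # Consider every combination of pronunciations (one choice per token).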
    for words in product(*phonemes):
        if not len(words):
            return

        # Count the number of syllables and extract the stress pattern.
        stress, syllables = zip(*((s[0], s[1:]) for w in words for s in w))

        # Compute the final sound.
        final_syllable = syllables[-1]
        if len(final_syllable[2]):
            final_sound = "_".join(map("_".join, final_syllable[1:]))
        elif len(final_syllable[0]):
            final_sound = "{0}_{1}".format(final_syllable[0][-1],
                                           "_".join(final_syllable[1]))
        else:
            final_sound = "_".join(final_syllable[1])

        # Update the possible versions for this sentence.
        nsyllables.add(len(stress))
        final_sounds.add(final_sound + "_{0}".format(int(stress[-1] > 0)))

    return nsyllables, final_sounds, [tokens[-1]]
Example 13
def fix_db():

    print "* Executing database FIX procedure..."

    # connect to db
    mongodb_url = os.getenv("OPENSHIFT_MONGODB_DB_URL")
    client = pym.MongoClient(mongodb_url)
    db = client["shalk"]
    coll = db["ngrams"]

    base_data_dir = os.getenv("OPENSHIFT_DATA_DIR")
    if not base_data_dir:
        base_data_dir = "../data/"

    # initialize cmu dict
    nltk.data.path = ["{0}nltk/".format(base_data_dir)]
    cdict = cmudict.dict()

    count = 0
    upcount = 0
    mod = 100

    # iterate over all docs that need fixing
    orlist = [
        {"syllables": {"$exists": False}},
        {"rand": {"$exists": False}},
        {"type": {"$exists": False}},
        {"rhyme": {"$exists": False}},
    ]
    ngrams = coll.find({"$or": orlist})
    total = ngrams.count()

    for ngram in ngrams:
        upngram = False
        lastword = get_last_word(ngram)

        if "syllables" not in ngram:
            upngram = True
            ngram["syllables"] = count_syllables(lastword, cdict)
        if "rand" not in ngram:
            upngram = True
            ngram["rand"] = random.random()
        if "rhyme" not in ngram:
            upngram = True
            ngram["rhyme"] = get_rhyme(lastword, cdict)

        if not upngram:
            count += 1
            continue

        update_ngram(ngram, db)

        upcount += 1
        count += 1
        if count % mod == 0:
            print "- {0} out of {1} analysed! Docs updated: {2}".format(count, total, upcount)
            sys.stdout.flush()

    print "* Database FIX procedure finished!"
Example 14
def does_rhyme_unit_test():    
    dictionary = cmudict.dict()
    print does_rhyme('lol','bol',2,dictionary)  
    print does_rhyme('cat','dog',2,dictionary)
    print does_rhyme('cat','bat',2,dictionary)
    print does_rhyme('cat','tot',2,dictionary)
    print does_rhyme('cat','tot',2,dictionary)
    print does_rhyme('hello','yellow',2,dictionary)
Example 15
def does_rhyme_unit_test():
    dictionary = cmudict.dict()
    print does_rhyme("lol", "bol", 2, dictionary)
    print does_rhyme("cat", "dog", 2, dictionary)
    print does_rhyme("cat", "bat", 2, dictionary)
    print does_rhyme("cat", "tot", 2, dictionary)
    print does_rhyme("cat", "tot", 2, dictionary)
    print does_rhyme("hello", "yellow", 2, dictionary)
Example 16
def load_pronunciations(pronun_dictionary_name='cmudict', stress='unstressed'):
    """ note that we only support cmudict from nltk """
    if stress not in STRESS_OPTIONS:
        raise TypeError

    try: cmu = cmudict.dict()
    except (LookupError, AttributeError):
        cmu = load_cmu_pickle()
Example 17
 def __init__(self, wav_folder):
     self.phones = {}
     self.get_wavs(wav_folder)
     # Initialise pronunciation dictionary (always add entries for punctuation symbols)
     self.pron_dict = dict.fromkeys(['.', '?', '!'], 'double_sil')
     self.pron_dict[','] = 'sil'
     self.whole_dict = cmudict.dict()
     self.get_pron_dict(args.phrase)
Example 18
def getMulti():
	cmu = cmudict.dict()
	rhymeToPros, pronunciationToWords = getDictionariesNeededForRhyming(cmu)
	print("rhymeToPros has "+str(len(rhymeToPros))+" items")
	print("pronunciationToWords has "+str(len(pronunciationToWords))+" items")
	rgs=[rhymeGroup(r,rhymeToPros,pronunciationToWords,syllabifier.syllabify) for r in rhymeToPros]
	multi=[r for r in rgs if (groupHasAtLeastOneDifference(r) and not(r.HasOneWord() or r.HasOnePronunciation()))]
	print("English has "+ str(len(multi))+" good rhyme groups\n")
	return multi
Example 19
def num_syllables(word):
    d = cmudict.dict()
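    # Hyphenated words: prefer the joined form if it has an entry,
    # otherwise sum the syllable counts of the parts.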
    if "-" in word:
        word2 = "".join(word.split("-"))
        if word2 in d:
            word = word2
        else:
            return sum([num_syllables(w) for w in word.split("-")])
    return list((len(list(y for y in x if isdigit(y[-1]))) for x in d[word.lower()]))[0]
Example 20
def test_num_syllables():
	s = cmudict.dict()
	tests = ['animal', 'i', '0', 1]
	expected = [3, 1, 1, 1]
	results = []
	for t in tests:
		results.append(num_syllables(t, s))
	rval = 1.0 * sum([e == r for e, r in zip(expected, results)]) / len(expected)
	return rval
Example 21
def set_up_globals(ono=True):
    global dictionary
    dictionary = cmudict.dict()
    global stressed
    stressed = "1"
    global unstressed
    unstressed = "0"
    if ono:
        setup_ono_type_map()
Example 22
def main():
    '''Generates the tracery grammar for @my_cat_ebooks.'''
    logging.basicConfig(
        level='INFO',
        format='%(asctime)s %(levelname)8s [%(name)s] %(message)s',
    )

    log.info('Loading CMU pronunciation dictionary')
    global cmu_pronounciations
    cmu_pronounciations = cmudict.dict()

    log.info('fruits')
    fruits = load_corpus("foods/fruits.json")["fruits"]

    log.info('body parts')
    body_parts = load_corpus("humans/bodyParts.json")["bodyParts"]

    log.info('amazing')
    amazing = load_corpus("words/encouraging_words.json")["encouraging_words"]

    log.info('superstar')
    superstar = [ln for s in wordnet.synsets('superstar') for ln in s.lemma_names()]

    pronouns = [
        "[he:he][him:him][hes:he's]",
        "[he:she][him:her][hes:she's]",
        # TODO: reintroduce this, but it affects the conjugation of the occupation.
        #
        #   they may not be an cleaner
        #   they cleanses exultantly
        #
        # "[he:they][him:them][hes:they're]",
        "[he:it][him:it][hes:it's]",
    ]

    grammar = {
        "atrociously": adjly("atrocious"),
        "watermelon": fruits,
        "seven": "two three four five six seven eight nine ten eleven twelve".split(),
        "arm": body_parts,
        "amazing": amazing,
        "guitar": instruments(),
        "superstar": superstar,
        "setPronouns": pronouns,
        "setOccupation": occupations(),
        "stanza": [
            textwrap.dedent(s).strip()
            for s, weight in stanza_weights.iteritems()
            for _ in xrange(weight)
        ],
        "origin": ["#[#setPronouns#][#setOccupation#]stanza#"],
    }

    log.info('writing grammar')
    with open('grammar.json', 'w') as f:
        json.dump(fp=f, indent=2, obj=grammar, sort_keys=True)
Example 23
def recover_file_to_db(datafile):

    filename = datafile.rsplit("/")[-1]
    print "* Recovering file [{0}] into db...".format(filename)

    # connect to db
    mongodb_url = os.getenv("OPENSHIFT_MONGODB_DB_URL")
    client = pym.MongoClient(mongodb_url)
    db = client["shalk"]
    coll = db["ngrams"]

    base_data_dir = os.getenv("OPENSHIFT_DATA_DIR")
    if not base_data_dir:
        base_data_dir = "../data/"

    # initialize cmu dict
    nltk.data.path = ["{0}nltk/".format(base_data_dir)]
    cdict = cmudict.dict()

    count = 0
    mod = 1000

    # open file in reverse, and import it until we find the point where we stopped
    ngrams = []
    for line in reversed(open(datafile).readlines()):
        ngram = get_ngram(line, cdict)

        if not ngram:
            continue

        # stop when we find this ngram in the db already
        if find_one(ngram, db):
            # if `force`, we will iterate over all docs, but will ignore the ones that are already inserted
            if args.force:
                print "- ({0}) Ngram [{1}] already in the db, jumping to the next one...".format(filename, ngram)
                sys.stdout.flush()
                continue

            print "- ({0}) Ngram [{1}] already in the db, stopping the recovery!".format(filename, ngram)
            sys.stdout.flush()
            break

        ngrams.append(ngram)
        count += 1
        if count % mod == 0:
            print "- ({0}) Inserted [{1}] ngrams into db...".format(filename, len(ngrams) * (count / mod))
            print "- ({0}) {1} -> {2}".format(filename, ngrams[0], ngrams[-1])
            sys.stdout.flush()
            insert_ngrams(ngrams, db)
            ngrams = []

    print "- ({0}) Inserting last [{1}] ngrams into db...".format(filename, len(ngrams))
    sys.stdout.flush()
    insert_ngrams(ngrams, db)

    print "* Finished importing file [{0}]!".format(filename)
Example 24
    def __init__(self, wav_folder):
        self.out = SA.Audio(rate=16000) # Create a blank audio for output, with a sample rate of 16000

        self.phones = self.get_wavs(wav_folder) # Add wavs as audio objects for each phoneme
                                                # and additional elements for pause breaks

        self.add_phone_break('comma - break', 250)
        self.add_phone_break('sentence - break', 500)

        self.word_phones_dict = cmudict.dict()
Example 25
 def __init__(self):
     self.dict = cmudict.dict()
     self.unknown_dict = {}
     for key in self.dict.keys():
         if "'" in key:
             self.unknown_dict[key.replace("'", '')] = key
         if key.endswith('ing'):
             self.unknown_dict[key.replace('ing', 'in')] = key
         if 'every' in key:
             self.unknown_dict[key.replace('every', 'evry')] = key
Example 26
def SyllableCalculator(text):
    d = cmudict.dict()
    counter = 0.0
    tokens = re.findall("[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+",text)
    for token in tokens:
        count = 1.0
        if token.lower() in d.keys():
            count = max([len(list(y for y in x if isdigit(y[-1]))) for x in d[token.lower()]])
        counter = counter+count
    return counter
Example 27
    def __init__(self):
        """Instantiate a MinPairFinder.

        Note: To avoid unnecessarily repeating the work of loading the dict,
        call `get_instance` instead.
        """
        if not self._dict:
            self._dict = cmudict.dict()
        if not self._rhymes_dict:
            self._rhymes_dict = self._get_rhymes_dict()
Example 28
        def text_to_phoneme_2(text): # different format for speech
          phoneme_dict = cmudict.dict()
          text = ""
          for word in raw_english:
            syllable = phoneme_dict[word][0] # there should be a counter somewhere for each phonemic version
            syllable = '-'.join(syllable)
            text = text + syllable + "- -"

          text = "-" + text
          return text
Example 29
    def get_rhymes(self, word):
        rhymes = []

        word_pronounciations = cmudict.dict()[word]
        for word_pronounciation in word_pronounciations:
            for rhyme, rhyme_pronounciation in cmudict.entries():
                if rhyme_pronounciation[-1] == word_pronounciation[-1]:
                    rhymes.append(rhyme)

        return rhymes
Example 30
def transcribeWord(word):
    dict = cmudict.dict()

    if word in dict:
        pronunciations = dict[word]
        syllables = pronunciations[0]
        pronunciation = ' '.join(syllables)
        return pronunciation
    else:
        return False
Example 31
    get_syllable_dict,
    get_track_str,
)
from haikuincidence.utils.haiku_utils import count_syllables, get_haiku
from haikuincidence.utils.text_utils import clean_text

# get data to use for dealing with tweets
track_str = get_track_str()
ignore_tweet_list = get_ignore_tweet_list()
syllable_dict = get_syllable_dict()
emoticons_list = get_emoticons_list()

# Use inflect to change digits to their English word equivalent
inflect_p = inflect.engine()
# Use the CMU dictionary to count syllables
pronounce_dict = cmudict.dict()

# guess_syl_method = "min"
guess_syl_method = "mean"
# guess_syl_method = "max"


def get_syllable_count_and_haiku(text):
    count = count_syllables(
        text,
        inflect_p,
        pronounce_dict,
        syllable_dict,
        emoticons_list,
        guess_syl_method,
    )
Example 32
import nltk
from nltk.corpus import cmudict
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize
import numpy as np
import os.path
import pandas as pd
import pickle
import random
import re
import requests
import string
import sys


# dictionary to look up pronunciations
master_dict = cmudict.dict()


def save_obj(obj, fname):
    with open(fname, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)


def load_obj(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)


class Poem(object):

    def __init__(self, text, fname=None):
Example 33
def get_syllables(word):
    d = cmudict.dict()
    return [
        len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]
    ][0]
Example 34
brown.categories()  # the categories of texts in the corpus
brown.fileids(categories="hobbies")  # text files in the category of "hobbies"

brown.raw()[:100]  # the raw text of the brown corpus (note tags)
brown.sents()[0]  # first sentence in the corpus
brown.words()[:10]  # first ten words in the corpus
brown.tagged_sents()[
    0]  # first sentence, each word tagged with part-of-speech info
brown.tagged_words()[:50]  # first fifty words, all tagged

##-- Specialized corpus: cmudict --##

##     The Carnegie Mellon University Pronouncing Dictionary
##     over 130,000 words, includes stress and variant pronunciations

cmudict.dict()['idiosyncratic']
cmudict.dict()['caravan']
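
## A small added illustration (assuming standard cmudict entries): each value is a
## list of pronunciations, and the phonemes ending in a stress digit are the vowels,
## so counting them gives the syllable count.
pron = cmudict.dict()['caravan'][0]            # first listed pronunciation
sum(1 for ph in pron if ph[-1].isdigit())      # -> 3 (one vowel phoneme per syllable)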

### ~~~~~~~~~~~~~~~~~~~~~~~ ###
### 2. Tokenizing Sentences ###
### ~~~~~~~~~~~~~~~~~~~~~~~ ###
###
### - Breaking a sentence string into tokens (words, etc.)
###
### - A few common issues:
###     Punctuation
###     Contractions (e.g., can't)
###     Non-alphabetical words

sent = "I don't want a blueberry cake... I want a vanilla-almond cake!!!"
Example 35
def starts_with_vowel_sound(word, pronunciations=cmudict.dict()):
    for syllables in pronunciations.get(word, []):
        return syllables[0][-1].isdigit()  # use only the first one
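# Added usage note: cmudict vowel phonemes end in a stress digit, so checking the
# first phoneme of the first pronunciation flags vowel-initial words, e.g.
# starts_with_vowel_sound('apple') -> True, starts_with_vowel_sound('banana') -> False.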
Example 36
if not exists(NLTK_DATA_PATH):
    for datum in NLTK_DATA:
        nltk.download(datum)

from nltk.stem.snowball import EnglishStemmer
import nltk.chunk as chunk
from nltk.corpus import cmudict

DIVIDER_TAG = ':'  # nltk uses this to tag for ; and :

# Set up some state that we'll use in the functions throughout this file:
# TODO consider making a class that has modular stemmer/tokenizer
stemmer = EnglishStemmer()
tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
cmudict_dict = cmudict.dict()

# Some useful regexes:
vowel_re = re.compile("[aeiouAEIOU]")
vowel_phoneme_re = re.compile("AA|AE|AH|AO|AW|AY|EH|EY|ER|IH|IY|OW|OY|UH|UW")
consonant_phoneme_re = re.compile(
    "^(?:B|D|G|JH|L|N|P|S|T|V|Y|ZH|CH|DH|F|HH|K|M|NG|R|SH|TH|W|Z)")

# Helper predicates:
is_vowel = partial(match, vowel_re)
is_vowel_phoneme = partial(match, vowel_phoneme_re)
is_consonant_phoneme = partial(match, consonant_phoneme_re)


def word_to_phonemes(word):
    result = cmudict_dict.get(word.lower(), None)
Example 37
from nltk.corpus import cmudict

from HMM import unsupervised_HMM
from helper import *

# PREPROCESSING
# text = open(os.path.join(os.getcwd(), 'data/shakespeare.txt')).read()
text = open(os.path.join(os.getcwd(), 'data/allpoems.txt')).read()
# visualization of whole data set
wordcloud = text_to_wordcloud(text, title='Shakespeare')
# TODO: extract words
# - keep hyphenated words hyphenated
# - some words could be tokenized as bigrams
# - separate punctuation from words, and store them separately
obs, obs_map = parse_observations(text)
syllables = cmudict.dict()
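# Map punctuation to an empty pronunciation so it contributes zero syllables.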
for punct in [".", ",", ":", ";", "!", "?"]:
    syllables.update({punct:[[]]})

# UNSUPERVISED LEARNING
#Was 20
hmm8 = unsupervised_HMM(obs, 10, 100)

# visualizations of sparsity of A, O as well as
# visualizations of states as wordclouds
visualize_sparsities(hmm8, O_max_cols=50)
wordclouds = states_to_wordclouds(hmm8, obs_map)

#This part only works in Jupyter Notebook
anim = animate_emission(hmm8, obs_map, M=8)
HTML(anim.to_html5_video())
Example 38
def cut_tweet_to_syllables_unit_test():
    dictionary = cmudict.dict()
    print cut_tweet_to_syllables(
        'damn n***a look at all these syllables tho for real', 10, dictionary)
Example 39
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet as wn
from nltk.corpus import cmudict
from nltk.stem import WordNetLemmatizer
import csv

#initialization needed for using WordNetLemmatizer and cmudict
stemmer = nltk.PorterStemmer()
pronunciations = cmudict.dict()
lemma = WordNetLemmatizer()


#return true if the word has more than one meaning
def is_homo(word):
    if (len(wn.synsets(word)) > 1):
        return True
    return False


#return true if the word has more than one pronunciation
def is_hetero(word):
    if (word in pronunciations.keys()):
        if len(pronunciations[word]) > 1:
            return True
    return False


#return true if given word is a verb with no meaning such as be, do, have
def is_general(word):
    if (word == "was" or word == "been" or word == "are" or word == "did"
Example 40
class AmericanEnglishLangContext(LanguageBoundsInterface):
    """Defines the properties and implementation of standard American English."""

    ########## Variables ##########

    _cmu = cmudict.dict(
    )  # Pretrained phenome generation model. Created outside of methods because it is used over iteration(s) and is expensive to generate; TREAT THIS VALUE AS AN IMMUTABLE.
    _MULTI_TOKEN_INDICATOR = "_"  # Character used to identify when a token has multiple words. This functionality is specific to a corpus. Must be changed if corpus is changed.
    _NULL_PHENOME_INDICATOR = "*NONE*"  # Used by the algorithm to indicate that a corresponding phenome could not be found for a token
    _SIMILARITY_THRESHOLD = 0.2  # The threshold that must be passed for a word to be considered similar. Scaled from 0-1.
    vowelphenomes = [
        "AA", "AE", "AH", "AO", "AW", "AY", "AX", "AXR", "EH", "ER", "EY",
        "IH", "IX", "IY", "OW", "OY", "UH", "UW", "UX"
    ]  # Contains all phenomes that produce vowel-related sounds for this language.

    ###############################

    def _getproperformattype(self, unformattoken):
        """Used to parse through the Wordnet sysnet-token return value to retrieve only relevant sections. Currently the only returns the word.
        In future implementations, this function may not be needed if the corpus has a function to return only the word as a string."""

        name, junk = unformattoken.name().split(".", 1)
        return name

    def _getproperhandlemissingphenome(self, unknowntoken):
        """Takes a unknown-phenome (a token which could not be evaluated by CMUdict) and attempts to generate a phenome. If CMUdict or
        Wordnet implementation is changed this function MUST be changed."""

        finaleval = []

        # After various testing, it has been determined that calculating for two letters yields the most consistent results for unknown phenomes.
        tokenlen = len(unknowntoken)
        if tokenlen == 0:
            finaleval.append([self._NULL_PHENOME_INDICATOR])
        elif tokenlen == 1:
            finaleval.append([unknowntoken.upper()
                              ])  # The letter IS the phenome
        else:
            relevant = unknowntoken[:2]  # get first two chars
            finalattempt = self._cmu.get(relevant, None)

            if finalattempt is None:  # No possible phenome can be generated by this algorithm
                finaleval.append([self._NULL_PHENOME_INDICATOR])
            elif finalattempt is list:
                finaleval.append(finalattempt)
            else:  # 'finalattempt' is guaranteed to only be of type None, list, or list[list].
                finaleval.extend(
                    finalattempt
                )  # flatten list; this step is necessary to maintain parsability

        return finaleval

    def _getproperhandlemultitoken(self, multitoken):
        """Takes a multi-word (a token with words seperated by '_' by Wordnet) and breaks it down into a format that can be evaluated by the CMUdict. If CMUdict or
        Wordnet implementation is changed this function MUST be changed."""

        finaleval = []
        individualtokens = multitoken.split(self._MULTI_TOKEN_INDICATOR)

        for token in individualtokens:  # evaluate each token's phenome individually; then represent the multitoken for EACH phenome calculated, when returned to scanning.
            phenome = self._cmu.get(token.lower(), None)

            if phenome is list:
                finaleval.append(phenome)

            else:  # 'phenome' is guaranteed to only be of type None, list, or list[list].
                if phenome is None:
                    phenome = self._getproperhandlemissingphenome(token)

                finaleval.extend(
                    phenome
                )  # flatten list; this step is necessary to maintain parsability

        return finaleval

    def getphenomes(self, arg):
        """Returns all phenome-lists related to the token. ('context' is the representation of the phrase in collection form.)"""

        # uses CMUdict as the core processing algorithm. If CMUdict fails to find a match the function will predict a possible phenome for the token.
        # This function is guaranteed to return a value.

        generatephenome = self._cmu.get(
            arg.lower(), None
        )  # _cmu is defined globally above in "VARIABLES" section. Treat as an immutable.
        if generatephenome is None:
            if arg.__contains__(
                    self._MULTI_TOKEN_INDICATOR
            ):  # _MULTI_TOKEN_INDICATOR is defined globally above in "VARIABLES" section. Treat as an immutable.
                generatephenome = self._getproperhandlemultitoken(arg)

            else:  # token is unknown by CMUdict
                generatephenome = self._getproperhandlemissingphenome(arg)

        # When multiple phenomes exist for same word, a list[list[str]] is generated
        return generatephenome

    def hypernyms(self, context, arg):
        """Returns all hypernyms related to the token. ('context' is the representation of the phrase in collection form.)"""

        # This function assumes the use of Wordnet. If Wordnet implementation changes, this function MUST change.

        eval = None
        interpretation = lesk(context, arg)
        if interpretation is not None:
            eval = map(self._getproperformattype, interpretation.hypernyms())

        return eval

    def hyponyms(self, context, arg):
        """Returns all hyponyms related to the token."""

        # This function assumes the use of Wordnet. If Wordnet implementation changes, this function MUST change.

        eval = None
        interpretation = lesk(context, arg)
        if interpretation is not None:
            eval = map(self._getproperformattype, interpretation.hyponyms())

        return eval

    def messagefail(self, input):
        """Produces the fail message to print to users in this language if the process cannot return a value."""
        built = " ".join(input)
        return (
            "Your input: '" + built +
            "' was not able to be parsed under the conditions you desired. Please try new conditions or try a new phrase."
        )

    def messageonlyresult(self, arg):
        """Produces a indicator message if only one result was possible from the input parameters given."""
        return ("This is the only result processed from the given input:\n" +
                arg)

    def messagetopresult(self, resultlen, requestedresultcount):
        """Produces the top 'x' results message to users in this language if the process has multiple results."""
        if resultlen < requestedresultcount:
            return ("Top " + str(resultlen) + " result(s):\n")
        else:
            return ("Top " + str(requestedresultcount) + " result(s):\n")

    def similarity(self, contextclues, arg1, arg2):
        """Returns a key-value pair for scoring similarity. [0] a bool that determines if the word is similar enough to satisfy language criteria
        and the score associated with the evaluation."""

        # This function assumes the use of Wordnet. If Wordnet implementation changes, this function MUST change.

        evaluation = False
        score = 0

        if arg1 is arg2:
            evaluation = True
            score = self._SIMILARITY_THRESHOLD  # Penalizing score to prevent paraphrases from returning themselves

        else:
            contextA = lesk(contextclues, arg1)
            contextB = lesk(contextclues, arg2)

            if contextA and contextB:  # Otherwise score will stay zero
                score = contextA.path_similarity(contextB)

                if score is not None and self._SIMILARITY_THRESHOLD <= score:
                    evaluation = True

        return (evaluation, score)

    def split(self, arg):
        # Returns all non-whitespace tokens.
        return RegexpTokenizer(r'\w+|\$[\d\.]+|\S+').tokenize(arg)
Example 41
 def __init__(self):
     self.cmudict = cmudict.dict()
Example 42
def do_sentences_rhyme_unit_test():
    dictionary = cmudict.dict()
    print do_sentences_rhyme('oh hello', 'no yellow', dictionary)
    print do_sentences_rhyme('so the dog', 'log', dictionary)
    print do_sentences_rhyme('potato', 'wefo', dictionary)
    print do_sentences_rhyme('hog', 'log', dictionary)
Example 43
        for i, tag in enumerate(tag_list):
            spelling, sense, pron = tag
            info = '(' + str(sense) + ',' + pron + ')'
            loc = sent.find(spelling, current_loc)
            sent = sent[:loc + len(spelling)] + info + sent[loc +
                                                            len(spelling):]
            current_loc = loc + len(spelling) + len(info)
        format_output['sentence'][row_idx] = sent
        print(sent, '--- Source: ', row['citation'])

    format_output.to_csv(filename, index=False, header=0)
    return


## Set up basic corpora
pron_dict = cmudict.dict()
brown_words = brown.tagged_words(tagset='universal')
treebank_words = treebank.tagged_words(tagset='universal')
nps_words = nps_chat.tagged_words(tagset='universal')
corpus = brown_words + treebank_words + nps_words
corpus = [(word.lower(), tag) for (word, tag) in corpus]
stopset = set(stopwords.words('english'))
## Set up pretrained spaCy's word vector
nlp = spacy.load('en_core_web_lg')

## Collect potential heteronyms
data = get_het_from_corpus(corpus)

## Assign Wiktionary data to the potential heteronyms
parser = init_wikparser()
data = get_pronunciation(parser, data)
Example 44
"""
Classes and utilities for extracting haiku from arbitrary text and evaluating them based on some programmatically
defined criteria
"""
import nltk
import string
from nltk.corpus import cmudict
from nltk_util import syllables_en
from haikus.evaluators import DEFAULT_HAIKU_EVALUATORS

global WORD_DICT
try:
    WORD_DICT = cmudict.dict()
except LookupError:
    nltk.download('cmudict')
    WORD_DICT = cmudict.dict()


class NonwordError(Exception):
    pass


class HaikuText(object):
    """
    A wrapper around some sequence of text
    """
    def __init__(self, text=None):
        self._text = text

    def get_text(self):
        return self._text
Example 45
def unit_test_count_syllables_sentence():
    dictionary = cmudict.dict()
    print count_syllables_sentence('hello please check my syllables',
                                   dictionary)
    print count_syllables_sentence('checking some syllables right now dog',
                                   dictionary)
Example 46
from __future__ import print_function
from __future__ import division
from scipy.integrate import quad
import random
import numpy as np
import codecs
import string
import re
import cPickle as cp
from nltk import pos_tag
from nltk import word_tokenize
import collections

from nltk.corpus import cmudict

d = cmudict.dict()  # dictionary of syllables from cmudict


def check_unique(unique):
    out = open('unique.txt', 'w')

    for i in unique:
        out.write(i + '\n')
    out.close()


# parses the text file 'shakespeare.txt' and adds each unique word to a dictionary,
# WORD_DIC, with a unique index
def parse(word_dic, index_dic):

    # open 'shakespeare.txt'
Example 47
    'date_of_publication', 'num_of_words', 'num_of_non_empty_lines',
    'num_of_verses', 'avg_word_len', 'avg_line_len', 'avg_lines_per_verse',
    'longest_line', 'words_per_line', 'largest_word',
    'poem_stress_list_no_punct', 'chars_per_line'
])
#
# Load JSON
#

with open(DATA_DIR + READ_JSON_FILE, 'r') as infh:

    cnt = 0
    no_lines = 0

    largest_word_corpus_ls = []
    prondict = cmudict.dict()

    # for every poem-file-object
    for data in import_utilities.json_parse(infh):
        # process object
        cnt = cnt + 1
        #print "cnt:", cnt
        labels_ls = []

        author = 'UNKNOWN'
        title = 'UNKNOWN'

        # get the data out of json
        for idx, val in enumerate(data):

            #print idx, val
Example 48
from nltk.corpus import cmudict
from nltk.tokenize import RegexpTokenizer
import os.path, time
import datetime
import common
import rssNewsFetcher
import pickle

d = cmudict.dict()  # get the CMU Pronouncing Dict
phrasetokenizer = RegexpTokenizer(r"[\w| |\-|\'|\‘|\’|\$]+")
wordtokenizer = RegexpTokenizer(r"[\w+|\']+")
soundtokenizer = RegexpTokenizer(r"[A-Z]+")


def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)


def nsyl(word):
    """return the max syllable count in the case of multiple pronunciations"""
    lastsound = ''
    syllables = 0
    try:
        if isinstance(d[word.lower()], list):
            word = d[word.lower()][0]
        for sound in word:
            if hasNumbers(sound):
                syllables = syllables + 1
                lastsound = ''
            #append last sound
            lastsound += soundtokenizer.tokenize(sound)[0]
Example 49
kyle_tokens = kyle_quotes_lower.apply(nltk.word_tokenize)
#kyle_quotes.head()

kyle_tokens_list = [
    word for inner_list in list(kyle_tokens) for word in inner_list
]
kyle_tokens_list = [
    re.sub(r'[^A-Za-z0-9\'\-{1}]+$|\'$', 'punc', i) for i in kyle_tokens_list
]
kyle_lexical_diversity = len(set(kyle_tokens_list)) / len(kyle_tokens_list)
#print(kyle_lexical_diversity)
#len(kyle_tokens_list)/len(kyle_tokens)

top_characters = quotes_by_character.count()[
    quotes_by_character.count().Line > 1000].index
pro_dict = cmudict.dict()


def get_character_params(data, character):

    character_quotes = data[data.Character == character].Line
    character_quotes_lower = character_quotes.apply(str.lower).apply(
        str.rstrip, '\n')
    character_tokens = character_quotes_lower.apply(nltk.word_tokenize)
    character_tokens_list = [
        word for inner_list in list(character_tokens) for word in inner_list
    ]
    character_tokens_list = [
        re.sub(r'[^A-Za-z0-9\'\-{1}]+$|\'$', 'punc', i)
        for i in character_tokens_list
    ]
Example 50
## SETUP
parser = argparse.ArgumentParser()
parser.add_argument('-r',
                    '--rhyme',
                    dest='rhyme',
                    help='provide a word to find its rhymes')
parser.add_argument('-p',
                    '--phones',
                    dest='phones',
                    help='provide a word/sentence to see its phonemes')
args = parser.parse_args()

## START
l_ents = cmudict.entries()  # "list" of entries
d_ents = cmudict.dict()  # "dict" of entries

# if they are using command line args, single usage mode
if len(argv) > 1:

    if args.rhyme:
        get_rhymes(args.rhyme)

    if args.phones:
        get_phones(args.phones)

# interactive mode with repeating menu and options
else:
    inp = ''
    while inp != 'q':
        print('\n(1) Find phonemes\n(2) Find rhyming words\n(q) Quit\n')
Example 51
    sys.path.append(nlp_dir)
import util

VERSION_MAJOR = 0
VERSION_MINOR = 7

MODULE_NAME = 'termset_expander.py'

global DEBUG
DEBUG = False

# load Spacy's English model
nlp = spacy.load('en_core_web_sm')

# initialize the CMU phoneme dictionary
cmu_dict = cmudict.dict()

# regexes for locating termsets in NLPQL files

# line comment - match everything from // up to but NOT including the newline
# also, don't match the // in a URL
str_line_comment = r'(?<!http:)(?<!https:)//.*(?=\n)'
regex_line_comment = re.compile(str_line_comment, re.IGNORECASE)

# multiline comment
str_multiline_comment = r'/\*.*?\*/'
regex_multiline_comment = re.compile(str_multiline_comment,
                                     re.IGNORECASE | re.DOTALL)

# a term is anything enclosed in double quotes
str_term = r'\"[^"]+\"'
Example 52
import Models
from nltk.corpus import cmudict
""" Global variables for the reading level project.

This module holds the global variables for the project.

Authors:
    Charles Billingsley
    Josh Getter
    Adam Stewart
    Josh Techentin

"""

# Main Globals
dictionary = cmudict.dict()
input_file = ''
file_content = ''
current_line_number = 0
full_input = ''
total_words = 0
total_sentences = 0
total_syllables = 0
target_reading_level = ''
shouldModify = False

# ChangeLevel Globals
target_reading_level = ''
target_reading_score = Models.ReadingScoreRange()
Example 53
def __syllables__(word):
    print "Doing syllables lookup for", word
    d = cmudict.dict()
    if word == '':
        return 0
    return [len(list(y for y in x if isdigit(y[-1]))) for x in d[word]][0]
Example 54
#!/usr/bin/python
import re
import inflect
import hyphenator
from nltk.corpus import cmudict

d = cmudict.dict()  #probably will need this
p = inflect.engine()


#this takes a word and separates its syllables so that they are hyphenated
#also capitalized
#if it takes a hyphenated word it should just return the hyphenated word (i have implemented this)
def hyphenate_word(word):
    word = '-'.join(hyphenator.hyphenate_word(word))
    if '--' in word:
        word = '-'.join(word.split('--'))
    return word


#returns a hyphenated version
def hyphenate_phrase(phrase):
    words = phrase.split(" ")
    returnme = []
    for word in words:
        returnme.append(hyphenate_word(word))
    print(" ".join(returnme))


def word_syllable_count(word):
    return hyphenate_word(word).count('-') + 1
Example 55
#-*- coding: utf-8 -*-

# Tools for working with poems
#
# Licensed under GPLv2 or later.

from __future__ import print_function
import json, os, re, sys
from collections import defaultdict
from string import ascii_lowercase
from Levenshtein import distance
from .countsyl import count_syllables

try:
    from nltk.corpus import cmudict
    cmu = cmudict.dict()
except (ImportError, LookupError):
    with open(os.path.join(os.path.dirname(__file__), 'cmudict/cmudict.json')) as json_file:
        cmu = json.load(json_file)

def elided_d(word):
    if word[-2:] == "'d":
        return word[:-2] + "ed"
    return word

def tokenize(poem):
    tokens = []
    for line in poem.split('\n'):
        line       = line.replace('-', ' ') # need to find a better tokenizer, but this works for now
        no_hyphens = line.replace('—', ' ') 
        cleaned    = re.sub(r'[^0-9a-zA-Z\s\']', '', no_hyphens) # keep apostrophes
Example 56
from collections import Counter
from nltk.corpus import words             #check dictionary
from nltk import pos_tag as posTag
import emoji #pip install
import re                           #elongation
from autocorrect import spell    #pip install     #check spelling
from nltk.tokenize import sent_tokenize #sentence tokenizer         https://www.nltk.org/api/nltk.tokenize.html also see
import csv                              #read file    
from datetime import datetime           #convert unix time to human time
from nltk.tokenize import RegexpTokenizer       #remove puncutations
from nltk import edit_distance as ed    #check word spelling correction distance
import urllib.request as urllib         #ud convert url to unicode
punctuations = RegexpTokenizer(r'\w+')
from nltk.corpus import cmudict
import math
CMUdict = cmudict.dict()      #syllable

class preProcess(object):
    
    def __init__(self):
        '''
        loads urban dictionary and emoji list
        '''
        self.ud=self.urbanLoad()
        self.emojiList=self.emojiLoad()
        
        
    def chanCleaner(self,post):          #clean 4archive posts
        '''
        cleans 4chan posts by removing the initial disclaimer
        '''
Example 57
import nltk
nltk.download('cmudict')

from nltk.corpus import cmudict
import numpy as np

d = cmudict.dict()


def syllable_count(word):
    try:
        return np.min([
            len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]
        ])
    except KeyError:
        #if word not found in cmudict
        return _syllables(word)


def _syllables(word):
    #referred from stackoverflow.com/questions/14541303/count-the-number-of-syllables-in-a-word
    count = 0
    vowels = 'aeiouy'
    word = word.lower()
    if word[0] in vowels:
        count += 1
    for index in range(1, len(word)):
        if word[index] in vowels and word[index - 1] not in vowels:
            count += 1
    if word.endswith('e'):
        count -= 1
Example 58
from nltk.corpus import cmudict
from pattern.en import parse, parsetree, wordnet, NOUN, pluralize
from BasicModels import Error
import os
import settings
import logging

LOGGER = logging.getLogger("pattern.server")

PRON = cmudict.dict()
AEIOU = ['A', 'E', 'I', 'O', 'U']


#countabl features from celex
def readNounList(fileName):
    nounList = open(fileName, "r")
    raw = nounList.read().splitlines()
    maps = dict()
    for line in raw:
        data = line.strip().split("\t")
        key = data[0]
        cop = data[1]
        if len(data) != 14:
            print "Read list wrong!"
            sys.exit(0)
        if maps.has_key(key):
            tmp = maps.get(key)
            if cop > tmp:
                maps[key] = data[1:]
            else:
                pass
import re

import numpy as np
import pandas as pd
# cmudict's entries() method lists all the phonemes
import nltk
from nltk.corpus import stopwords  # used via stopwords.words("english")
from nltk.corpus import cmudict
# import the two scikit-learn classes needed to compute tf-idf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

# In[618]:

# use the dict (hashmap) version of cmudict, which is faster to query
phonetic_check_dict = cmudict.dict()

# In[619]:

# cmudict.entries() is a list; each element is a tuple where tuple[0] is the word (or letter) and tuple[1] is its phonemes

# In[620]:


def get_data(filename):
    data_origin = pd.read_csv(filename, encoding='utf-8')
    return data_origin


# In[621]:
class SG:
    normalized_words = []
    pronunciation_tokens = []
    post_prosody = []
    cmud = cmudict.dict()
    sound_dict = sound_dict_generator.Synth().diphones

    # THIS IS USED FOR TESTING
    #def __init__(self):
        #self.normalized_words = ['<beginning>', '<question>', 'hello', 'there', 'professor', '<break,comma,1>', 'how', 'are',
        #                         'you', 'doing', '<break,question,2>', 'i', 'am', 'good', '<break,sent_end,2>',
        #                         '<exclamation>', 'This', 'is', 'so', 'amazing', '<break,exclamation,2>', '<end>']
        # self.normalized_words = ['doctor', 'rabbits', 'email', 'is', 'i', 'l', 'u', 'v', 'c', 'a', 'r' 'r',
        # 'o' 't' 's', 'three', 'zero', 'five', 'at', 'g', 'mail', 'dot', 'c', 'o', 'm', '<break,sent_end,2>', 'you',
        # 'can', 'checkout', 'his', 'website', '<break,comma,1>', 'r', 'a', 'b', 'b', 'i', 't', 'd', 'r', 'dot', 'g',
        # 'o', 'v', '<break,sent_end,2>', 'he', 'uses', 'forty', 'milliliters', 'beakers', 'to', 'find', 'tilde',
        # 'volume', '<break,sent_end,2>', 'he', 'has', '<currency>', 'negative', 'three', 'dollars', 'in', 'his',
        # 'bank', 'account', '<break,sent_end,2>']

    def __init__(self, n_w: list):
        self.normalized_words = n_w
        self.text_to_phoneme()
        self.prosody_analyzer()

    def text_to_phoneme(self):
        skip = 0
        for w in self.normalized_words:  # get the token from normalized_words
            if w in self.cmud:
                phone = self.cmud[w][0]  # convert tokens to its phoneme form
                for i in range(len(phone)):
                    phone[i] = re.sub("[^a-zA-Z\\s\-]", "", phone[i]).lower()
                self.pronunciation_tokens.append(phone)  # add the phoneme form of the word to pronunciation_tokens
            elif w[0] == '<' and w[-1] == '>':
                self.pronunciation_tokens.append([w])
            else:
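                # Word not in cmudict: greedily look up the longest chunk (up to five
                # characters) that has an entry, falling back to shorter chunks and
                # single letters, skipping characters already consumed.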
                for i in range(len(w)):
                    if skip > 0:
                        skip -= 1
                        continue
                    try:
                        phone = self.cmud[w[i:i+5].lower()][0]
                        skip = 4
                    except:
                        try:
                            phone = self.cmud[w[i:i+4].lower()][0]
                            skip = 3
                        except:
                            try:
                                phone = self.cmud[w[i:i+3].lower()][0]
                                skip = 2
                            except:
                                try:
                                    phone = self.cmud[w[i:i+2].lower()][0]
                                    skip = 1
                                except:
                                    try:
                                        phone = self.cmud[w[i].lower()][0]
                                    except:
                                        pass
                    for i in range(len(phone)):
                        phone[i] = re.sub("[^a-zA-Z\\s\-]", "", phone[i]).lower()
                    self.pronunciation_tokens.append(phone)
                # TODO: figure out what to do with words not in the cmu dictonary
                #       Possibilities: should we get the root?, use the google converter?


    def prosody_analyzer(self):
        temp = []
        for w in self.pronunciation_tokens:
            if w[0] == "<beginning>" or w[0] == "<end>":
                temp.append('pau')
                if w[0] == "<end>":
                    temp.append(w[0])
            elif w[0] == "<break,comma,1>":
                temp.append('pau')
                temp.append('pau')
            elif w[0] == "<break,semicolon,1.5>" or w[0] == "<break,colon,1.5>":
                temp.append('pau')
                temp.append('pau')
                temp.append('pau')
            elif w[0] == "<break,sent_end,2>" or w[0] == "<break,question,2>" or w[0] == "<break,exclamation,2>":
                temp.append('pau')
                temp.append('pau')
                temp.append(w[0])
                temp.append('pau')
            elif w[0] == "<question>" or w[0] == "<exclamation>":
                temp.append(w[0])
            elif w[0] == "<space>":
                continue
            else:
                for p in w:
                    temp.append(p)
        for i in range(len(temp)):
            if temp[i] == "<exclamation>" or temp[i] == "<question>" or temp[i] == "<break,sent_end,2>" or temp[i] == "<break,question,2>" or temp[i] == "<break,exclamation,2>":
                self.post_prosody.append(temp[i])
                continue
            if i != len(temp)-1:
                if temp[i+1] == "<exclamation>" or temp[i+1] == "<question>" or temp[i+1] == "<break,sent_end,2>" or temp[i+1] == "<break,question,2>" or temp[i+1] == "<break,exclamation,2>" or temp[i+1] == "<end>":
                    if temp[i+1] == "<end>":
                        self.post_prosody.append(temp[i+1])
                    else:
                        self.post_prosody.append(temp[i] + '-' + temp[i + 2])
                else:
                    self.post_prosody.append(temp[i] + '-' + temp[i+1])
        print(self.post_prosody)