def phone_synthesize_solution(self, text):
    """
    Synthesize speech from text by concatenating phonemes
    selected from the database
    """
    # Convert all words to lower case
    words = [word.lower() for word in text.split()]
    phones = []
    for word in words:
        try:
            # Use cmudict to get the phonemic representation
            phones.extend(cmudict.dict()[word][0])
        except (KeyError, IndexError):
            # If the word is not found in the dictionary, fall back to g2p
            # (a missing key raises KeyError, not IndexError)
            g2p = G2p()
            phones.extend(g2p(word))
    print(phones)
    # Initialize an empty audio segment
    result = AudioSegment.empty()
    # Concatenate phonemes selected from PHONEMES_DIR
    for phone in phones:
        # Strip the trailing stress marker (0/1/2), if any
        phone = phone[0:-1] if phone[-1].isdigit() else phone
        # Look up the phoneme wav file using phone_map
        sound_label = phone_map[phone]
        sound_path = PHONEMES_DIR + str(sound_label) + ".wav"
        audio = AudioSegment.from_wav(sound_path)
        result += audio
    # Write the synthesized .wav file to DST_DIR
    result.export(DST_DIR + "gen.wav", format="wav")
    play(result)
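# A minimal, self-contained sketch of the lookup-with-fallback step above.
# It assumes only the cmudict and g2p_en packages (the original's G2p import
# source is not shown, so g2p_en is an assumption); caching the dict in CMU
# avoids rebuilding it for every word.
import cmudict
from g2p_en import G2p

CMU = cmudict.dict()
G2P = G2p()

def text_to_phones(text):
    phones = []
    for word in text.lower().split():
        prons = CMU.get(word)
        if prons:
            phones.extend(prons[0])   # first listed pronunciation
        else:
            phones.extend(G2P(word))  # grapheme-to-phoneme fallback
    return phones

# Stress digits like the '0' in 'AH0' are still attached at this point.
print(text_to_phones("hello world"))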
def percentage_not_in_CMU(filename, verbose):
    total_words = 0
    n_not_in_CMU = 0
    pron_dict = cmudict.dict()  # avoid shadowing the built-in `dict`
    total_songs = 0
    if verbose:
        print('Following words are not in CMU dictionary:')
    with open(filename, 'r') as input_file:
        data = json.load(input_file)
        for song in data:
            if verbose:
                print(f"SONG: {song['title']}")
            total_songs += 1
            for line in song['lyrics']:
                last_word = line.strip().split(' ')[-1]
                last_word = last_word.lower()
                # Remove punctuation.
                last_word = re.sub(r"[^\w\d'\s]+", '', last_word)
                if last_word == '':
                    continue
                pron = pron_dict.get(last_word)
                if not pron:
                    if verbose:
                        print(last_word)
                    n_not_in_CMU += 1
                total_words += 1
    print(f'Analyzed {total_songs} songs.')
    return n_not_in_CMU / total_words
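# A small illustration of the last-word cleanup used above, on a made-up
# lyric line. The regex keeps word characters, digits, apostrophes and
# whitespace, so trailing punctuation is stripped before the CMU lookup.
import re

line = "And I think to myself, what a wonderful world!"
last_word = line.strip().split(' ')[-1].lower()
last_word = re.sub(r"[^\w\d'\s]+", '', last_word)
print(last_word)  # -> world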
def test_estimate():
    EXPECTED_ACCURACY = .75
    hits = []
    misses = []
    d = cmudict.dict()
    for word in d:
        phones = d[word][0]
        cmudict_syllables = 0
        for phone in phones:
            if re.match(r"\w*[012]$", phone):
                cmudict_syllables += 1
        estimated_syllables = syllables.estimate(word)
        if cmudict_syllables == estimated_syllables:
            hits.append(word)
        else:
            misses.append(word)
    hit = len(hits)
    miss = len(misses)
    total = hit + miss
    ACCURACY = hit / total
    if ACCURACY < EXPECTED_ACCURACY:
        raise AssertionError(
            'syllables.estimate(): Expected accuracy of {0}, got {1}.'.format(
                EXPECTED_ACCURACY, ACCURACY))
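# The test relies on the ARPAbet convention that every vowel phone carries a
# stress digit (0, 1 or 2), so counting digit-suffixed phones counts
# syllables. A quick self-contained check, assuming only cmudict:
import re
import cmudict

phones = cmudict.dict()["banana"][0]  # ['B', 'AH0', 'N', 'AE1', 'N', 'AH0']
n_syllables = sum(1 for p in phones if re.match(r"\w*[012]$", p))
print(n_syllables)  # -> 3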
def create_CMU_encoding_dictionary():
    """
    Create a CMU encoding dictionary.

    Returns:
        dict: CMU encoding dictionary
    """
    return cmudict.dict()
def __init__(self, fileName="lyrics.csv"):
    self.removePunct = re.compile(r":|\[|\(|chorus|verse")
    self.actualFile = pd.read_csv(fileName, encoding="ISO-8859-1",
                                  engine='python')
    self.cleanedLyrics = list()
    self.rhymingDict = cmudict.dict()
    self.keysInDict = sorted(self.rhymingDict)
    self.markovString = ""
def test_dict():
    EXPECTED_SIZE = 125997
    d = cmudict.dict()
    SIZE = len(d)
    if EXPECTED_SIZE != SIZE:
        raise AssertionError(
            'cmudict.dict(): Expected {0} keys, got {1}.'.format(
                EXPECTED_SIZE, SIZE))
def __init__(self,
             data_file,
             character_level=None,
             phoneme_level=None,
             vocabulary=None,
             transform=None):
    self.data_file = data_file
    self.data = joblib.load(open(self.data_file, 'rb'))
    self.character_level = character_level
    self.phoneme_level = phoneme_level
    self.transcription_processor = lambda words: words
    if self.character_level:
        characters = [chr(c) for c in range(ord('a'), ord('z') + 1)]
        characters += [' ']
        character_vocab = Vocabulary()
        for character in characters:
            character_vocab.add_word(character)
        self.vocabulary = character_vocab
        self.transcription_processor = \
            self._character_level_transcription_processor
    elif self.phoneme_level:
        cmu_phones = list(map(lambda x: x[0], cmudict.phones()))
        cmu_phones += [' ']
        phones_vocab = Vocabulary(custom_unk_word=' ')
        for phone in cmu_phones:
            phones_vocab.add_word(phone)
        self.vocabulary = phones_vocab
        self.phones_dict = cmudict.dict()
        self.transcription_processor = \
            self._phone_level_transcription_processor
    elif vocabulary is None:
        data_file_dir = os.path.dirname(self.data_file)
        data_file_prefix = os.path.splitext(self.data_file)[0]
        pickle_file_name = f'{data_file_prefix}_SpeechDataset.pickle'
        pickle_file_path = os.path.join(data_file_dir, pickle_file_name)
        if not os.path.isfile(pickle_file_path):
            dataset_info = self.build_vocabulary_from_dataset(self.data)
            pickle.dump(dataset_info, open(pickle_file_path, 'wb'))
        else:
            dataset_info = pickle.load(open(pickle_file_path, 'rb'))
        self.vocabulary = dataset_info['vocabulary']
    else:
        self.vocabulary = vocabulary
    self.transform = transform
    self.max_transcription_length = max([
        len(transcription)
        for transcription in self.data['transcription_tokens']
    ])
    self.max_input_length = max([
        spectrogram.shape[1]
        for spectrogram in self.data['audio_spectrograms']
    ])
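# The phoneme branch above builds its vocabulary from cmudict.phones(),
# whose entries pair an ARPAbet symbol with its properties, so x[0] picks
# out the bare symbol. A quick look, assuming only the cmudict package:
import cmudict

symbols = [entry[0] for entry in cmudict.phones()]
print(symbols[:5])  # ARPAbet symbols, e.g. ['AA', 'AE', 'AH', 'AO', 'AW']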
def getPseudoKeyword(target_phone, already_present):
    a = cmudict.dict()
    b = cmudict.words()
    found = False
    for word in b:
        for lst in a[word]:
            for phone in lst:
                if (target_phone == phone
                        and already_present.get(word) is None):
                    already_present[word] = 1
                    return word
                if (re.search(target_phone, phone)
                        and len(target_phone) != 1
                        and already_present.get(word) is None):
                    already_present[word] = 1
                    return word
def __init__(self):
    self.phones_for_word = cmudict.dict()
    self.id_to_word = list(self.phones_for_word.keys())
    self.word_to_id = {word: i for i, word in enumerate(self.id_to_word)}
    self.id_to_phoneme = list(map(lambda x: x[0], cmudict.phones()))
    self.phoneme_to_id = {
        phoneme: i for i, phoneme in enumerate(self.id_to_phoneme)
    }
    self.root_node = {}
    for word, phoneme_sequences in self.phones_for_word.items():
        word_id = self.word_to_id[word]
        for phoneme_sequence in phoneme_sequences:
            phoneme_sequence = map(self.remove_phoneme_numerals,
                                   phoneme_sequence)
            phoneme_sequence = list(phoneme_sequence)
            current_node = self.root_node
            for phoneme in phoneme_sequence[:-1]:
                phoneme_id = self.phoneme_to_id[phoneme]
                if phoneme_id not in current_node:
                    current_node[phoneme_id] = {
                        PhonemeTrie.WORDS_AT_PHONE: [word_id]
                    }
                else:
                    current_node[phoneme_id][
                        PhonemeTrie.WORDS_AT_PHONE].append(word_id)
                current_node = current_node[phoneme_id]
            terminating_phoneme = phoneme_sequence[-1]
            terminating_phoneme_id = self.phoneme_to_id[terminating_phoneme]
            if terminating_phoneme_id not in current_node:
                current_node[terminating_phoneme_id] = {
                    PhonemeTrie.WORDS_AT_PHONE: [word_id]
                }
            else:
                current_node[terminating_phoneme_id][
                    PhonemeTrie.WORDS_AT_PHONE].append(word_id)
            if PhonemeTrie.TERMINATING_WORDS not in current_node[
                    terminating_phoneme_id]:
                current_node[terminating_phoneme_id][
                    PhonemeTrie.TERMINATING_WORDS] = []
            current_node[terminating_phoneme_id][
                PhonemeTrie.TERMINATING_WORDS].append(word_id)
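# A hedged sketch of how such a trie might be walked to find the words whose
# pronunciation is exactly a given phoneme sequence. It assumes the
# PhonemeTrie class above (its WORDS_AT_PHONE / TERMINATING_WORDS node keys
# and phoneme_to_id / id_to_word tables); `lookup_exact` is a hypothetical
# helper, not part of the original class.
def lookup_exact(trie, phonemes):
    node = trie.root_node
    for phoneme in phonemes:
        phoneme_id = trie.phoneme_to_id[phoneme]
        if phoneme_id not in node:
            return []
        node = node[phoneme_id]  # child nodes are keyed in the same dict
    word_ids = node.get(PhonemeTrie.TERMINATING_WORDS, [])
    return [trie.id_to_word[i] for i in word_ids]

# e.g. trie = PhonemeTrie(); lookup_exact(trie, ['K', 'AE', 'T']) would
# include 'cat' (stress digits were stripped by remove_phoneme_numerals
# at build time).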
def main():
    print("Start time: {0}".format(datetime.now().time()))
    cmu = cmudict.dict()
    all_transcribed_lines = dict()
    load_and_transcribe_poetry(all_transcribed_lines, cmu)
    print("Poetry Transcribed: {0}".format(datetime.now().time()))
    load_and_transcribe_lyrics(all_transcribed_lines, cmu)
    print("Lyrics Transcribed: {0}".format(datetime.now().time()))
    # Open with an explicit encoding and write the line itself; writing
    # line.encode('utf-8') would put byte-literal strings (b'...') in the CSV.
    with open('transcribed_data.csv', mode='w', encoding='utf-8',
              newline='') as out_file:
        out_writer = csv.writer(out_file, delimiter=',', quotechar='"')
        for line, val in all_transcribed_lines.items():
            out_writer.writerow([line, val])
    print("Written to file: {0}".format(datetime.now().time()))
def test_dict_comments():
    DICT = cmudict.dict()
    EXPECTED_DICT = {
        "d'artagnan": [['D', 'AH0', 'R', 'T', 'AE1', 'NG', 'Y', 'AH0', 'N']],
        "danglar": [['D', 'AH0', 'NG', 'L', 'AA1', 'R']],
        "danglars": [['D', 'AH0', 'NG', 'L', 'AA1', 'R', 'Z']],
        "gdp": [['G', 'IY1', 'D', 'IY1', 'P', 'IY1']],
        "hiv": [['EY1', 'CH', 'AY1', 'V', 'IY1']],
        "porthos": [['P', 'AO0', 'R', 'T', 'AO1', 'S']],
        "spieth": [['S', 'P', 'IY1', 'TH'], ['S', 'P', 'AY1', 'AH0', 'TH']]
    }
    for TEST_WORD in EXPECTED_DICT:
        EXPECTED_PRONUNCIATION = EXPECTED_DICT[TEST_WORD]
        PRONUNCIATION = DICT[TEST_WORD]
        if EXPECTED_PRONUNCIATION != PRONUNCIATION:
            raise AssertionError(
                'cmudict.dict(): Expected "{0}", got "{1}".'.format(
                    EXPECTED_PRONUNCIATION, PRONUNCIATION))
def percentage_of_numbers_to_all_OOD(filename, verbose):
    n_not_in_CMU = 0
    n_only_numbers = 0
    n_contain_numbers = 0
    only_numbers = []
    contain_numbers = []
    pron_dict = cmudict.dict()  # avoid shadowing the built-in `dict`
    total_songs = 0
    if verbose:
        print('Following words are not in CMU dictionary and ARE NOT NUMBERS:')
    with open(filename, 'r') as input_file:
        data = json.load(input_file)
        for song in data:
            total_songs += 1
            for line in song['lyrics']:
                last_word = line.strip().split(' ')[-1]
                last_word = last_word.lower()
                # Remove punctuation.
                last_word = re.sub(r"[^\w\d'\s]+", '', last_word)
                if last_word == '':
                    continue
                pron = pron_dict.get(last_word)
                if not pron:
                    if last_word.isdecimal():
                        n_only_numbers += 1
                        only_numbers.append(last_word)
                    elif re.search(r"\d", last_word):
                        n_contain_numbers += 1
                        contain_numbers.append(last_word)
                    else:
                        if verbose:
                            print(last_word)
                    # Count every out-of-dictionary word, numbers included,
                    # so the returned ratios are fractions of all OOD words.
                    n_not_in_CMU += 1
    if verbose:
        print("Following words contain a number:")
        for n in contain_numbers:
            print(n)
        print("Following words are numbers:")
        for n in only_numbers:
            print(n)
    print(f'Analyzed {total_songs} songs.')
    return ((n_contain_numbers + n_only_numbers) / n_not_in_CMU,
            n_only_numbers / n_not_in_CMU)
def __init__(self):
    self.filename = "LCP/lcpr_i.sav"
    self.cmudict = cmudict.dict()
    self.wnlp = WonderlicNLP()
    self.embeddings_index = {}
    self.wiki_top10 = [
        word[0].split()[0]
        for word in pd.read_csv("LCP/wiki_top10.csv").values
    ][:10001]
    self.infersent_model_path = 'LCP/infersent%s.pkl' % 1
    self.infersent_model_params = {
        'bsize': 64,
        'word_emb_dim': 300,
        'enc_lstm_dim': 2048,
        'pool_type': 'max',
        'dpout_model': 0.0,
        'version': 1
    }
    self.infersent = InferSent(self.infersent_model_params)
    self.model = RandomForestRegressor(n_estimators=100)
def __init__(self):
    self.cmudict = cmudict.dict()
    self.title_bank = None
    self.folder = os.path.dirname(os.path.realpath(__file__))
    # Try reading content for the title_bank
    try:
        with open(os.path.join(self.folder, "data", "titles.pickle"),
                  "rb") as f:
            self.title_bank = pickle.load(f)
    except FileNotFoundError:
        from title_scrape import download_gutenberg, gutenberg_preprocess
        download_gutenberg()
        gutenberg_preprocess()
        with open(os.path.join(self.folder, "data", "titles.pickle"),
                  "rb") as f:
            self.title_bank = pickle.load(f)
class transliterate:
    try:
        # Make an English dictionary with pronunciations
        import cmudict
        english_dict = cmudict.dict()
    except ModuleNotFoundError:
        print("Need the CMU dictionary with pronunciations. "
              "TRY 'pip install cmudict'")
        raise

    def __init__(self, arpa_kata_csv_file_path):
        # ARPAbet syllable pronunciation to katakana dictionary
        import pandas as pd
        arpa_kata_df = pd.read_csv(arpa_kata_csv_file_path)
        self.arpa_kata_dict = {}
        for ak in arpa_kata_df.values:
            c1 = ak[0].replace("'", '').strip()
            v1 = ak[1].strip().split(':')
            self.arpa_kata_dict[c1] = v1
            if not c1[-1].isnumeric():
                continue
            for st in ['0', '1', '2']:
                if c1[:-1] + st in self.arpa_kata_dict:
                    continue
                self.arpa_kata_dict[c1[:-1] + st] = v1

    # Word-break algorithm based on the English dictionary
    @staticmethod
    def breakings(w):
        def wordbreak(s, i):
            if not s[i:]:
                return []
            p = ''
            a = []
            for c in s[i:-1]:
                p += c
                i += 1
                if not transliterate.english_dict.get(p) or \
                        len(transliterate.english_dict.get(p)) == 0:
                    continue
                a += [[p] + arr for arr in wordbreak(s, i) if arr]
            p += s[-1]
            if not transliterate.english_dict.get(p) or \
                    len(transliterate.english_dict.get(p)) == 0:
                return a
            a += [[p]]
            return a

        a = wordbreak(w, 0)
        return a

    # Forms a single string based on [1, [2, 3], [5]] => [125, 135]
    @staticmethod
    def merger(l):
        if not l:
            return []
        if all([type(ll) is not list for ll in l]):
            return l
        if len(l) == 1:
            return transliterate.merger(l[0])
        b = transliterate.merger(l[0:int(len(l) / 2)])
        c = transliterate.merger(l[int(len(l) / 2):])
        if not c:
            return b
        if not b:
            return c
        return [w1 + w2 for w1 in b for w2 in c]

    def con_pronunciation_katakana(self, arpa_list):
        b_1 = []
        for c_b_i in range(len(arpa_list)):
            c_b = arpa_list[c_b_i]
            b_2 = []
            ps = transliterate.english_dict.get(c_b)
            for p in ps:
                i = 0
                k_w = []
                while i < len(p):
                    arpa_key = p[i]
                    if p[i][-1].isnumeric():
                        i += 1
                    elif i + 1 < len(p) and str(p[i + 1][-1]).isnumeric():
                        arpa_key += ' ' + p[i + 1]
                        i += 2
                    elif (c_b_i != len(arpa_list) - 1
                          and 'ッ' != self.arpa_kata_dict[p[i]][0]
                          and i == len(p) - 1):
                        k_w.append(['ッ'])
                        i += 1
                    else:
                        i += 1
                    k_w.append(self.arpa_kata_dict[arpa_key])
                b_2 += transliterate.merger(k_w)
            b_1.append(b_2)
        return b_1

    def english_to_katakana(self, english_words):
        english_kata_dict = {}
        ii = 0
        for w in english_words:
            english_kata_dict[w] = set()
            ii += 1
            if ii % 50 == 0:
                print(ii)
            w_splits = [[w]] if transliterate.english_dict.get(w) and \
                len(transliterate.english_dict.get(w)) != 0 \
                else transliterate.breakings(w)
            max_split = 0
            max_split_arr = []
            for f in w_splits:
                ff = [w for w in f if len(w) > 2]
                if len(''.join(ff)) > max_split:
                    max_split = len(''.join(ff))
                    max_split_arr = [ff]
                elif len(''.join(ff)) == max_split:
                    max_split_arr.append(f)
            for b in max_split_arr:
                b_1 = self.con_pronunciation_katakana(b)
                english_kata_dict[w] = \
                    english_kata_dict[w].union(transliterate.merger(b_1))
        return english_kata_dict
            prev_vowel = True
        else:
            prev_vowel = False
    # y at the end of the word adds one syllable to the count
    if word[-1] == 'y' and not prev_vowel:
        count += 1
    # e at the end of the word is usually silent
    if word[-1] == 'e' and not prev_vowel:
        count -= 1
    return count


rhyming_dict = cmudict.dict()


# My function for determining if two words rhyme
def is_rhyme(a, b):
    vowels = ['a', 'e', 'i', 'o', 'u']
    # Use .get() so a word missing from the CMU dictionary yields an empty
    # list instead of raising KeyError
    test_a = rhyming_dict.get(a, [])
    test_b = rhyming_dict.get(b, [])
    last_a = 0
    last_b = 0
    # This is when either the first or second word is not in the CMU rhyming
    # dictionary. In this case I say they rhyme if the last syllable in each
    # word is the same
    if len(test_a) == 0 or len(test_b) == 0:
        for i in range(len(a) - 1, -1, -1):
            if a[i] in vowels:
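# A self-contained sketch of the usual phoneme-based rhyme test that the
# dictionary branch of is_rhyme relies on: two words rhyme if their phones
# match from the last stressed vowel onward. `rhymes_by_phones` is a
# hypothetical helper (not the original code) and assumes both words are
# in the dictionary.
import cmudict

CMU = cmudict.dict()

def rhymes_by_phones(a, b):
    def tail(word):
        phones = CMU[word][0]
        # indices of phones carrying primary/secondary stress
        stressed = [i for i, p in enumerate(phones) if p[-1] in '12']
        return phones[stressed[-1]:] if stressed else phones
    return tail(a) == tail(b)

print(rhymes_by_phones("cat", "hat"))  # True  ('AE1 T' == 'AE1 T')
print(rhymes_by_phones("cat", "dog"))  # False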
import cmudict
from g2p.data import DoubleBets

a2a_dataset = [
    [
        DoubleBets.alphabet.tseq2iseq(alphaseq),
        DoubleBets.arpabet.tseq2iseq(arpaseq),
    ]
    for k, vs in cmudict.dict().items()
    for alphaseq, arpaseq in [(k, v) for v in vs]
]
import cmudict

vowels = ["A", "E", "I", "O", "U"]
the_dict = cmudict.dict()


def get_syllables(word):
    """Get the syllable count by looking the word up in the CMU dict"""
    try:
        phonemes = the_dict[word]
        count = 0
        for p in phonemes[0]:
            if p[0] in vowels:
                count += 1
        return count
    except Exception:
        count = _syllables(word)
        return count


def _syllables(word):
    """Otherwise, if the CMU dict does not contain the entry, manually
    estimate the syllables. This is not completely correct, but since there
    are no true standards beyond the standard dictionary, our assumption is
    that the general factoring rules apply, based on several simple rules"""
    vowels = 'aeiouAEIOU'
    count = 0
    word = word.rstrip()  # fixed: was `line.rstrip()`, but `line` is undefined here
    if len(word) <= 3:
        """if the word is small, if the word ends with y and starts with a vowel
def __init__(self, df_lyrics):
    self.df_lyrics = df_lyrics
    self.syllables = cmudict.dict()
import pandas as pd
import re
import numpy as np
import cmudict
import textdistance
import sklearn
import pickle
import nltk

# Load vocab
# file = open('vocab', 'rb')
# vocab = pickle.load(file)

unigrams_df = pd.read_csv('unigram_freq.csv', index_col='word')
pron_dict = cmudict.dict()


# The class for Word
class Word:
    # Constructor
    def __init__(self, token):
        self.token = re.sub(r'\W+', '', str(token)).lower()

    # Basic features
    def length(self):
        if not self.token:
            return 0
        return len(self.token)

    def frequency(self):
        try:
            return np.log(unigrams_df.loc[self.token]['count'])
        except:
class CANLTK:
    stops = stopwords.words('english')
    arpabet = cmudict.dict()
    twitter_tokenizer = TweetTokenizer()

    @staticmethod
    def n_lower_chars(string):
        return sum(map(str.islower, string))

    @staticmethod
    def n_upper_chars(string):
        return sum(map(str.isupper, string))

    @staticmethod
    def n_isspace_chars(string):
        return sum(map(str.isspace, string))

    @staticmethod
    def n_vowels_chars(string):
        return sum(map(string.lower().count, "aeiou"))

    @staticmethod
    def n_count_and_print_alphabets(CONFIG, string):
        CONFIG['feature_count']
        d = {}
        uppercaseAlphabetArray = [
            "A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M",
            "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"
        ]
        lowercaseAlphabetArray = [
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
        ]
        # print(len(uppercaseAlphabetArray))
        # print(len(lowercaseAlphabetArray))
        for i in range(26):
            combo = uppercaseAlphabetArray[i] + lowercaseAlphabetArray[i]
            CONFIG['feature_count'] += 1
            d["f" + str(CONFIG['feature_count']) + "-" + combo] = len(
                re.findall('[' + combo + ']', string))
        return d

    @staticmethod
    def n_special_chars(string):
        return sum(map(string.lower().count, ".,?!<>@#$%&()[]:;\'\""))

    @staticmethod
    def n_count_and_print_special_chars(CONFIG, string):
        CONFIG['feature_count']
        d = {}
        i = 0
        specialChars = [
            ".", ",", "—", "–", "’", "‘", "?", "!", "<", ">", "@", "#", "$",
            "%", "&", "(", ")", "[", "]", ":", ";", "\'", "\""
        ]
        specialCharsNamed = [
            "fullstop", "comma", "em-dash", "en-dash",
            "right-single-quotation-mark", "left-single-quotation-mark",
            "question-mark", "exclamation", "less-than-sign",
            "greate-than-sign", "at-sign", "hash", "dollar", "percentage",
            "ampersand", "open-brackets", "closing-brackets",
            "open-sq-brackets", "close-sq-brackets", "colan", "semi-colan",
            "single-quotes", "double-quotes"
        ]
        for c in specialChars:
            CONFIG['feature_count'] += 1
            d["f" + str(CONFIG['feature_count']) + "-p-" +
              specialCharsNamed[i]] = len(re.findall('[' + c + ']', string))
            i += 1
        return d

    @staticmethod
    def extract_emojis(str):
        return ''.join(c for c in str if c in emoji.UNICODE_EMOJI)

    @staticmethod
    def n_long_words(wordTokens):
        filteredTokens = list(filter(lambda x: len(x) > 6, wordTokens))
        return len(filteredTokens)

    @staticmethod
    def n_words_le_3(wordTokens):
        filteredTokens = list(filter(lambda x: len(x) <= 3, wordTokens))
        return len(filteredTokens)

    @staticmethod
    def n_words_le_2(wordTokens):
        filteredTokens = list(filter(lambda x: len(x) <= 2, wordTokens))
        return len(filteredTokens)

    @staticmethod
    def avg_words(wordTokens):
        count = 0
        for token in wordTokens:
            count += len(token)
        if len(wordTokens) != 0:
            return count / len(wordTokens)
        else:
            return 0

    @staticmethod
    def n_lowercase_sentences(sentTokens):
        count = 0
        for token in sentTokens:
            if token[0].islower():
                count += 1
        return count

    @staticmethod
    def n_uppercase_sentences(sentTokens):
        count = 0
        for token in sentTokens:
            if token[0].isupper():
                count += 1
        return count

    @staticmethod
    def n_each_emoticons(string):
        d = {}
        # pylint: disable=unused-variable
        for key, value in emot.EMOTICONS.items():
            d[key] = string.count(key)
        return d

    @staticmethod
    def n_each_emojis(string):
        d = {}
        # pylint: disable=unused-variable
        for key, value in emoji.UNICODE_EMOJI.items():
            d[key] = string.count(key)
        return d

    @staticmethod
    def n_misspelled_words(sentTokens):
        spell = SpellChecker()
        # A single proper punctuation mark isn't regarded as misspelled;
        # multiple false punctuation marks are considered misspelled
        # misspelled = spell.unknown(['somessthing', 'is', 'hapenning',
        #                             'here', '!', '👍', ',', ':)'])
        misspelled = spell.unknown(sentTokens)
        return len(misspelled)

    @staticmethod
    def n_total_punctuations(text):
        count = 0
        for p in string.punctuation:
            count += text.count(p)
        return count

    @staticmethod
    def print_n_each_punctuation(CONFIG, text):
        CONFIG['feature_count']  # pylint: disable=unused-variable
        d = {}
        count = 0
        i = 0
        specialCharsNamed = [
            "exclamation", "double-quotes", "hash", "dollar", "percentage",
            "ampersand", "single-quotes", "open-brackets", "closing-brackets",
            "asterix", "plus", "comma", "dash", "fullstop", "slash", "colan",
            "semi-colan", "less-than-sign", "equal-sign", "greater-than-sign",
            "question-mark", "at-sign", "open-sq-brackets", "back-slash",
            "close-sq-brackets", "caret", "underscore", "grave-accent",
            "open-curly-brace", "vertical-bar", "close-curly-brace", "tilde"
        ]
        for p in string.punctuation:
            CONFIG['feature_count'] += 1
            d["f" + str(CONFIG['feature_count']) + "-p-" +
              specialCharsNamed[i]] = text.count(p)
            i += 1
        return d

    @staticmethod
    def avg_syllables(sentTokens):
        # print(sentTokens)
        # Emoticons like O.o skew the data; also, Okay👍👍🏿 counts as a
        # single word, so syllables aren't counted there. Need to cleanse
        # the tokens before sending them in.
        validWords = 0
        count = 0
        for token in sentTokens:
            token = token.lower()
            if token in CANLTK.arpabet:
                # print(token)
                # print(arpabet[token])
                count += len(CANLTK.arpabet[token][0])
                validWords += 1
        if validWords != 0:
            return count / validWords
        else:
            return 0

    # https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
    @staticmethod
    def n_punctuation(string):
        # need to cleanse emojis
        count = 0
        for c in string:
            if "P" in unicodedata.category(c):
                # print(c, unicodedata.category(c))
                count += 1
        return count

    @staticmethod
    def prune_emojis_emoticons(string):
        # At times this doesn't work, especially when an emoticon comes
        # after an unusual emoji like O.o which isn't registered
        if "location" in emot.emoji(string):
            for loc in reversed(emot.emoji(string)['location']):
                string = string[0:loc[0]] + string[loc[1] + 1::]
        # print(emot.emoticons(string))
        if "location" in emot.emoticons(string):
            for loc in reversed(emot.emoticons(string)['location']):
                string = string[0:loc[0]] + string[loc[1] + 1::]
        return string

    @staticmethod
    def n_function_words(string):
        count = 0
        for f in string:
            if f in CANLTK.stops:
                count += 1
        return count

    @staticmethod
    def n_context_words(string):
        count = 0
        for f in string:
            if f not in CANLTK.stops:
                count += 1
        return count

    # def count__each_most_common_words(string):
    #     d = {}
    #     freq_dist = FreqDist(wordTokens)
    #     for li in freq_dist.most_common(10):
    #         li

    @staticmethod
    def n_total_emoticons(string):
        if any("value" in d for d in emot.emoticons(string)):
            return len(emot.emoticons(string)["value"])
        else:
            return 0

    @staticmethod
    def n_total_emojis(string):
        if any("value" in d for d in emot.emoji(string)):
            return len(emot.emoji(string)["value"])
        else:
            return 0

    @staticmethod
    def prune_punctuations_special_characters(text_arr):
        # need to recorrect
        for text in text_arr:
            if text in string.punctuation:
                text_arr.remove(text)
        return text_arr

    @staticmethod
    def prune_function_words(string_arr):
        items_to_remove = []
        for string in string_arr:
            string_lowercase = string.lower()
            if string_lowercase in CANLTK.stops:
                items_to_remove.append(string)
        for rm in items_to_remove:
            string_arr.remove(rm)
        return string_arr

    @staticmethod
    def n_words(string, string_arr):
        count = 0
        # for f in string:
        if string in string_arr:
            count += 1
        return count

    @staticmethod
    def replace_keys(string):
        return (string.replace("!", "exclamation")
                .replace("\"", "double-quotes")
                .replace("$", "dollar")
                .replace("%", "percentage")
                .replace("&", "ampersand")
                .replace("'", "single-quotes")
                .replace("(", "open-brackets")
                .replace(")", "closing-brackets")
                .replace("*", "asterix")
                .replace("+", "plus")
                .replace(",", "comma")
                .replace("-", "dash")
                .replace(".", "fullstop")
                .replace("/", "slash")
                .replace(":", "colan")
                .replace(";", "semi-colan")
                .replace("<", "less-than-sign")
                .replace(">", "greater-than-sign")
                .replace("=", "equal-sign")
                .replace("?", "question-mark")
                .replace("@", "at-sign")
                .replace("[", "open-sq-brackets")
                .replace("\\", "back-slash")
                .replace("]", "close-sq-brackets")
                .replace("^", "caret")
                .replace("_", "underscore")
                .replace("`", "grave-accent")
                .replace("{", "open-curly-brace")
                .replace("|", "vertical-bar")
                .replace("}", "close-curly-brace")
                .replace("~", "tilde")
                .replace("#", "hash")
                .replace("•", "bullet-sign")
                .replace(" ", "-"))

    @staticmethod
    def get_top_from_dictionary(dictionary, min_prune_value=1):
        items_to_remove = []
        for key, value in dictionary.items():
            if value < min_prune_value:
                items_to_remove.append(key)
        for item in items_to_remove:
            del dictionary[item]
        return dictionary

    @staticmethod
    def misspelled_word_list(CONFIG, word_tokens):
        misspelled_freq_dist = {}
        spell = SpellChecker()
        # Words like I'll and I'm are flagged as misspelt; this is fine,
        # though, since it means that person spells them in that particular
        # manner and that is being flagged
        # misspelled = spell.unknown(['somessthing', 'is', 'hapenning',
        #                             'here', '!', '👍', ',', ':)',
        #                             'somessthing'])
        # misspelled = spell.unknown(['hamzas', 'emails', "I'll", "I'm"])
        misspelled = spell.unknown(word_tokens)
        word_tokens_lowercase = []
        for w in word_tokens:
            word_tokens_lowercase.append(w.lower())
        for w in misspelled:
            misspelled_freq_dist[w] = word_tokens_lowercase.count(w)
        # sorted(misspelled_freq_dist.items(), key=lambda x: x[1], reverse=True)
        misspelled_freq_dist = CANLTK.get_top_from_dictionary(
            misspelled_freq_dist, CONFIG['MISSPELLED_MIN_PRUNE_VALUE'])
        return misspelled_freq_dist

    # Old method
    # line = f2.readline().rstrip('\n')
    # while line:
    #     generate_values(line)
    #     line = f2.readline().rstrip('\n')
    # f2.close()

    # def preprocessing_fn(inputs):
    #     modified_inputs = {}
    #     for key, value in inputs.items():
    #         modified_inputs[key] = tft.scale_to_0_1(value)
    #     return modified_inputs

    # def normalize_data(data):
    #     # Ignore the warnings
    #     fieldnames = data[0].keys()
    #     datatypes = {}
    #     for fn in fieldnames:
    #         datatypes[fn] = tf.io.FixedLenFeature([], tf.float32)
    #     raw_data_metadata = dataset_metadata.DatasetMetadata(
    #         dataset_schema.from_feature_spec(datatypes))
    #     with tft_beam.Context(temp_dir=tempfile.mkdtemp()):
    #         transformed_dataset, transform_fn = (  # pylint: disable=unused-variable
    #             (data, raw_data_metadata) | tft_beam.AnalyzeAndTransformDataset(
    #                 preprocessing_fn))
    #         transformed_data, transformed_metadata = transformed_dataset  # pylint: disable=unused-variable
    #     return transformed_data

    @staticmethod
    def lexical_ttr(tokens):
        return lex_div.ttr(tokens)

    @staticmethod
    def lexical_msttr(tokens):
        return lex_div.msttr(tokens)

    @staticmethod
    def lexical_mattr(tokens):
        return lex_div.mattr(tokens)

    @staticmethod
    def lexical_hdd(tokens):
        return lex_div.hdd(tokens)

    @staticmethod
    def lexical_mtld(tokens):
        return lex_div.mtld(tokens)

    @staticmethod
    def n_word_extensions(tokens):
        count = 0
        for token in tokens:
            try:
                corrected_word = pytypo.correct(token)
                if not (corrected_word == token):
                    count = count + 1
            except:
                ex = 1
        return count

    @staticmethod
    def most_common_word_extensions(CONFIG, tokens):
        fd = nltk.FreqDist(tokens)
        extend_word_list_dict = []
        for k, v in fd.items():
            try:
                corrected_word = pytypo.correct(k)
                if not (corrected_word == k):
                    extend_word_list_dict.append({"word": k, "count": v})
            except:
                ex = 1
        extend_word_list_dict_sorted = sorted(extend_word_list_dict,
                                              key=lambda k: k['count'],
                                              reverse=True)
        top_extend_word_list_dict = extend_word_list_dict_sorted[:int(
            CONFIG['NO_OF_MOST_FREQ_WORD_EXTENSIONS'])]
        return top_extend_word_list_dict

    @staticmethod
    def count_each_word_extension(extend_word_list_dict, tokens):
        each_extend_word_list_dict = []
        for extended_word_d in extend_word_list_dict:
            count = 0
            if extended_word_d["word"] in tokens:
                count = count + 1
            each_extend_word_list_dict.append({
                "word": extended_word_d["word"],
                "count": count
            })
        return each_extend_word_list_dict

    @staticmethod
    def most_common_bigrams(CONFIG, tokens):
        # Create your bigrams
        bgs = nltk.bigrams(tokens)
        # Compute the frequency distribution for all the bigrams in the text
        bigrams_freq_dist = nltk.FreqDist(bgs)
        top_list = list(
            filter(lambda x: x[1] >= int(CONFIG['BIGRAMS_MIN_PRUNE_VALUE']),
                   bigrams_freq_dist.items()))
        converted_list = []
        for i in top_list:
            converted_list.append({"ngrams": list(i[0]), "count": i[1]})
        return converted_list

    @staticmethod
    def most_common_trigrams(CONFIG, tokens):
        # Create your trigrams
        tgs = nltk.trigrams(tokens)
        # Compute the frequency distribution for all the trigrams in the text
        trigrams_freq_dist = nltk.FreqDist(tgs)
        top_list = list(
            filter(lambda x: x[1] >= int(CONFIG['TRIGRAMS_MIN_PRUNE_VALUE']),
                   trigrams_freq_dist.items()))
        converted_list = []
        for i in top_list:
            converted_list.append({"ngrams": list(i[0]), "count": i[1]})
        return converted_list

    @staticmethod
    def count_each_ngrams(tokens, freq_dist_dict):
        ngs = nltk.bigrams(tokens)
        ngrams_freq_dist = nltk.FreqDist(ngs)
        ngram_freq_dist_count = []
        for li in freq_dist_dict:
            freq_count = 0
            for bigram, count in ngrams_freq_dist.items():
                if bigram == li:
                    freq_count = count
                    break
            ngram_freq_dist_count.append({
                "ngrams": li["ngrams"],
                "count": freq_count
            })
        return ngram_freq_dist_count

    @staticmethod
    def n_grammar_errors(string):
        tool = language_check.LanguageTool('en-US')
        matches = tool.check(string)
        count = 0
        for m in matches:
            if m.category == "Grammar":
                count = count + 1
        return count

    @staticmethod
    def normalize_data(data):
        raw = []
        for i in data:
            r2 = []
            for k, v in i.items():
                r2.append(v)
            raw.append(r2)
        # r_normalized = preprocessing.normalize(raw, norm='l2', axis=0)
        min_max_scaler = preprocessing.MinMaxScaler()
        r_normalized = min_max_scaler.fit_transform(raw)
        normalized_list = []
        for val in r_normalized:
            normalized_dict = {}
            heading_names = list(data[0])
            for index, inner_val in enumerate(val):
                normalized_dict[heading_names[index]] = inner_val
            normalized_list.append(normalized_dict)
        return normalized_list


# main()
# Write code to generate negative results for analysis using the same params

# ADDITIONAL REFERENCES
# https://pc.net/emoticons/
# TODO do CBOW for additional accuracy
# TODO use LIWC API or maybe develop some of those features
#   http://www.utpsyc.org/TAT/LIWCTATresults.php
#   https://liwc.wpengine.com/compare-dictionaries/
# TODO https://github.com/LSYS/lexicalrichness https://sp1718.github.io/nltk.pdf
# TODO https://github.com/adeshpande3/LSTM-Sentiment-Analysis
#
# TODO Sentiment Analysis
#   https://github.com/axelnine/Sentiment-Analysis
#   https://github.com/shubhi-sareen/Sentiment-Analysis
#   https://github.com/ian-nai/Simple-Sentiment-Analysis
#   https://github.com/changhuixu/sentiment-analysis-using-python
#   https://textblob.readthedocs.io/en/dev/quickstart.html#sentiment-analysis
#   https://github.com/MohamedAfham/Twitter-Sentiment-Analysis-Supervised-Learning
# TODO Lexical richness
#   https://pypi.org/project/lexicalrichness/
# TODO Typo and word extensions
#   https://pypi.org/project/pytypo/
#   https://stackoverflow.com/questions/20170022/elongated-word-check-in-sentence
# TODO British or American English
#   https://datascience.stackexchange.com/questions/23236/tokenize-text-with-both-american-and-english-words
#   https://stackoverflow.com/questions/42329766/python-nlp-british-english-vs-american-english
#   https://github.com/hyperreality/American-British-English-Translator
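# A minimal usage sketch for the syllable feature above, assuming the class
# dependencies (nltk stopwords data, cmudict, etc.) are installed. Note that
# CANLTK.arpabet maps each word to its full phone list, so avg_syllables
# actually averages phones per word rather than syllables; only the vowel
# phones (those with stress digits) mark syllables.
tokens = ["the", "quick", "brown", "fox"]
print(CANLTK.avg_syllables(tokens))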
# -*- coding: utf-8 -*-
"""
@author: Dan
"""
import nltk
import sys
import cmudict
from string import punctuation
import json

# Note: this rebinds the module name `cmudict` to the pronunciation dict
cmudict = cmudict.dict()
path = (r'Desktop\python code')
file = 'txt cmu.txt'
f = open(r'C:\Users\Dan\Desktop\python code\zip\train.txt')
text = f.read()
with open('missing_words.json') as f:
    missing_words = json.load(f)


def count_syllables(words):
    words = words.replace('-', ' ')
    words = words.lower().split()
    num_sylls = 0
    for word in words:
        word = word.strip(punctuation)
        if word.endswith("'s") or word.endswith("’s"):
def get_syllables(word):
    # One count per listed pronunciation: vowel phones carry a trailing
    # stress digit, so counting digit-suffixed phones counts syllables.
    d = cmudict.dict()
    return [
        len(list(y for y in x if y[-1].isdigit())) for x in d[word.lower()]
    ]
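# Usage sketch: a word with multiple dictionary pronunciations gets one
# syllable count per pronunciation.
print(get_syllables("fire"))  # e.g. [2, 1] for 'F AY1 ER0' and 'F AY1 R'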
def lookup_word(word_s):
    # standard dict access
    return cmudict.dict().get(word_s)
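# Each call to cmudict.dict() rebuilds the whole mapping from the bundled
# data file, so for repeated lookups it is cheaper to build it once. A
# minimal caching variant (the module-level CMU name is illustrative):
import cmudict

CMU = cmudict.dict()

def lookup_word_cached(word_s):
    return CMU.get(word_s)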
"""
Input:
    input_phonemes  List of the phonemes that make up the word
Output:
    syllable_count  Number of syllables for that word
"""


def calculate_number_of_syllables(input_phonemes):
    phonemes_string = "".join(input_phonemes[0])
    syllable_count = 0
    for letter in phonemes_string:
        if letter.isdigit():
            syllable_count += 1
    return syllable_count


if __name__ == "__main__":
    pronunciation_dictionary = cmudict.dict()
    word_count = 0
    output_dictionary = {}
    start_time = time.time()
    for key, value in pronunciation_dictionary.items():
        # Check that it has valid letters (ignore words with apostrophes)
        is_valid = True
        for index in key:
            if not index.islower():
                is_valid = False
                break
        number_of_syllables = calculate_number_of_syllables(value)
        # Ignore unwanted words
# -*- coding: utf-8 -*-
"""
@author: Dan
"""
import nltk
import sys
import json
import cmudict
from string import punctuation

cmu = cmudict.dict()
path = (r'Desktop\python code')
file = 'txt cmu.txt'
f = open(r'C:\Users\Dan\Desktop\python code\zip\train.txt')
text = f.read()


def main():
    haiku = load_haiku(text)
    exceptions = cmudict_missing(haiku)  # fixed typo: was `cmuduct_missing`
    missing_words_dict = make_exceptions_dict(exceptions)
    save_exceptions(missing_words_dict)


def load_haiku(text):
    # fixed: main() passes the already-read string, so split it directly
    # (the original read from an undefined `in_file`)
    haiku = set(text.replace('-', ' ').split())
import cmudict
from functools import reduce
import re
import numpy as np

from lib.features import features, feature_weights

# We store the cmudict as an object in memory so that we don't have to
# reload it every single time we call word_to_phonemes.
cmudict_cache = cmudict.dict()

# Maps from diphthongs to their monophthonic parts. Also "ER" for no reason.
diphthong_pairs = {
    "AW": ["AE", "UH"],
    "AY": ["AE", "IH"],
    "ER": ["R"],
    "EY": ["E", "IH"],
    "OW": ["O", "UH"],
    "OY": ["AO", "IH"],
}


def expand_phoneme(phoneme):
    """
    Expands a phoneme to potentially multiple phonemes. Used to map
    diphthongs to its monophthongs in series.
    """
    return diphthong_pairs.get(phoneme, [phoneme])


def word_to_phonemes(word):
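# A quick illustration of expand_phoneme on the phones of one word. The
# flattening via sum(..., []) is one simple way to splice the expanded
# lists back together; stress digits need stripping first.
phones = [p[:-1] if p[-1].isdigit() else p
          for p in cmudict_cache["cow"][0]]     # ['K', 'AW']
expanded = sum([expand_phoneme(p) for p in phones], [])
print(expanded)                                 # ['K', 'AE', 'UH']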
from text.cmuToKorean import CMUToKorean
import cmudict  # dup in this dir

ret = CMUToKorean.convert('seventeen',
                          " ".join(cmudict.dict()["seventeen"][0]))
print(ret)
#!/usr/bin/env python
from utils import rhyme_pattern, slant_distance, n_syllables

if __name__ == "__main__":
    """
    Create neo4j database to house the words and patterns.
    """
    import os
    import cmudict
    from neo4j import GraphDatabase
    from itertools import product

    D = cmudict.dict()

    USERNAME = os.environ["NEO4J_USERNAME"]
    PASSWORD = os.environ["NEO4J_PASSWORD"]
    URI = os.environ["NEO4J_URI"]
    driver = GraphDatabase.driver(URI, auth=(USERNAME, PASSWORD))

    with open('words.txt', 'r') as fh:
        words = fh.read().strip().split('\n')

    with driver.session() as session:
        session.run("MATCH (n) DETACH DELETE n;")
        for word in words:
            for pron_list in D[word]:
                pron_str = ' '.join(pron_list)
                rp_pron_str = rhyme_pattern(pron_str)
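# A hedged sketch of how the loop above might persist one (word,
# pronunciation) pair, reusing `driver` from the snippet. The node labels,
# property names and MERGE shape are illustrative guesses, not the original
# schema; the $word / $pron placeholders are the standard neo4j-driver
# parameterized-query pattern.
with driver.session() as session:
    session.run(
        "MERGE (w:Word {text: $word}) "
        "MERGE (p:Pronunciation {phones: $pron}) "
        "MERGE (w)-[:PRONOUNCED_AS]->(p)",
        word="cat", pron="K AE1 T")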
def __init__(self, lang='en'):
    if lang == 'en':
        self.cmu_dict = cmudict.dict()