def number_to_words(self, number: str) -> List[str]:
    """Return a list of possible word matches for the given number."""
    # pronouncing.pronunciations is a list of (word, phonemes) tuples
    # pronouncing.lookup is a dict mapping each word to its phonemes
    # e.g. "83" should match "FM" and "VM"
    # FIXME: This whole implementation is basically a special form of
    # pronouncing.search() and hence needs to call .init_cmu(). Ideally this
    # detail would not be needed.
    pronouncing.init_cmu()
    pattern = "^"
    for n in map(int, re.findall(r"\d", number)):
        pattern += f"({'|'.join(self.MAPPING[n])})"
    pattern += "$"
    matcher = re.compile(pattern)
    matches = []
    for word, phonemes in pronouncing.pronunciations:
        consts = "".join(
            filter(lambda p: p in self.phonemes2num, phonemes.split()))
        if matcher.match(consts):
            matches.append(word)
    return matches
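
# A minimal standalone sketch of the same idea outside the class: build the
# digit-to-consonant regex and scan the CMU entries directly. The MAPPING and
# PHONEME_DIGITS tables below are illustrative stand-ins for the class
# attributes (self.MAPPING / self.phonemes2num), covering only the digits
# needed for the "83" example.
import re
import pronouncing

MAPPING = {8: ("F", "V"), 3: ("M",)}        # hypothetical subset of self.MAPPING
PHONEME_DIGITS = {"F": 8, "V": 8, "M": 3}   # stands in for self.phonemes2num

pronouncing.init_cmu()
pattern = re.compile(
    "^" + "".join(f"({'|'.join(MAPPING[int(d)])})" for d in "83") + "$")
hits = [word for word, phonemes in pronouncing.pronunciations
        if pattern.match(
            "".join(p for p in phonemes.split() if p in PHONEME_DIGITS))]
# `hits` should include entries such as "fm" and "vm" (cmudict words are lower-case).
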
def load():
    pr.init_cmu()  # pronouncing loads data on demand, so let's demand it
    feature_vocab, feature_vocab_idx_map = phoneme_feature_vocab()
    letter_vocab, letter_vocab_idx_map = orthography_vocab()
    max_phone_len = max_phoneme_feature_len()
    max_letter_len = max_orthography_len()
    # TODO: held-out data here
    letter_data = np.zeros(
        (len(pr.pronunciations), max_letter_len, len(letter_vocab)),
        dtype=np.float32)
    phonfeat_data = np.zeros(
        (len(pr.pronunciations), max_phone_len, len(feature_vocab)),
        dtype=np.float32)
    letter_target_data = np.zeros(
        (len(pr.pronunciations), max_letter_len, len(letter_vocab)),
        dtype=np.float32)
    phonfeat_target_data = np.zeros(
        (len(pr.pronunciations), max_phone_len, len(feature_vocab)),
        dtype=np.float32)
    for i, (word, phones) in enumerate(pr.pronunciations):
        # orthography: one-hot for each character index
        word = "^" + word + "$"
        for t, char in enumerate(word):
            letter_data[i, t, letter_vocab_idx_map[char]] = 1.
            if t > 0:
                letter_target_data[i, t - 1, letter_vocab_idx_map[char]] = 1.
        # clean errant comments
        phones = re.sub(' #.*$', '', phones)
        # phonemes: multi-label k-hot phonetic features
        for t, phone in enumerate(["^"] + phones.split() + ["$"]):
            for ft in featurephone.phone_feature_map[phone.strip('012')] + \
                    (('str',) if re.search(r'[12]$', phone) else tuple()):
                phonfeat_data[i, t, feature_vocab_idx_map[ft]] = 1.
                if t > 0:
                    phonfeat_target_data[i, t - 1, feature_vocab_idx_map[ft]] = 1.
    return {
        'phoneme_feature_vocab': feature_vocab,
        'phoneme_feature_idx_map': feature_vocab_idx_map,
        'orthography_vocab': letter_vocab,
        'orthography_idx_map': letter_vocab_idx_map,
        'phoneme_feature_data': phonfeat_data,
        'phoneme_feature_target_data': phonfeat_target_data,
        'orthography_data': letter_data,
        'orthography_target_data': letter_target_data
    }
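
# Hedged sketch of consuming load()'s output (variable names below are
# illustrative, not from the excerpt): every array shares the same first axis,
# one row per CMU entry, so splitting them together keeps the rows aligned.
from sklearn.model_selection import train_test_split

data = load()
(orth_train, orth_test,
 phon_train, phon_test) = train_test_split(
    data['orthography_data'],
    data['phoneme_feature_data'],
    test_size=0.1,
    random_state=0)
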
def build_db():
    pronouncing.init_cmu()
    build_word_freq()
    for word, phones in pronouncing.pronunciations:
        if len(word) < 3:
            continue
        if filter_re.search(word):
            continue
        if word not in WORD_FREQ:
            continue
        match = splitter.match(phones)
        if not match:
            continue
        start, end = match.groups()
        rank = WORD_FREQ.get(word, 10000)
        t = (rank, start, end, word)
        BY_END[end].add(t)
        BY_START[start].add(t)
        BY_PHONES[phones].add(t)
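
# Hedged usage sketch for the indexes above (`splitter` and the BY_* tables are
# module-level objects assumed from the excerpt): after build_db(), look up
# entries whose pronunciation begins with the ending of a seed word's
# pronunciation, which is the overlap that BY_START/BY_END make cheap to find.
def candidates_after(seed_word):
    out = set()
    for phones in pronouncing.phones_for_word(seed_word):
        match = splitter.match(phones)
        if match:
            _, end = match.groups()
            out.update(BY_START[end])   # (rank, start, end, word) tuples
    return sorted(out)
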
def _compile_data(self):
    clean_word = lambda word: ''.join(c for c in word if c.isalpha())
    pronouncing.init_cmu()
    chars = set()
    vocab = sorted(map(clean_word, pronouncing.lookup.keys()))
    self.vocab = random.sample(vocab, 20000)
    suffix_index = collections.defaultdict(set)
    for word in pronouncing.lookup.keys():
        for suffix_len in (3, 4, 5):
            suffix_index[word[-min(suffix_len, len(word)):]].add(word)
    pos_pairs, neg_pairs = set(), set()
    for j, word in enumerate(self.vocab):
        assert word
        rhymes_set = set(pronouncing.rhymes(word))
        rhymes = list(rhymes_set)
        if rhymes:
            for rhyme in rhymes:
                pos_pairs.add((word, rhyme, 1))
                chars.update(set(list(word) + list(rhyme)))
            non_rhymes, neighbors = set(), set()
            for suffix_len in (3, 4, 5):
                neighbors = neighbors.union(
                    suffix_index[word[-min(suffix_len, len(word)):]])
            neighbors = list(neighbors)
            random.shuffle(neighbors)
            for neighbor in neighbors:
                if len(non_rhymes) >= (len(rhymes) / 2):
                    break
                if neighbor != word and neighbor not in rhymes_set:
                    non_rhymes.add(neighbor)
            while len(non_rhymes) < len(rhymes):
                non_rhyme = random.sample(vocab, 1)[0]
                if non_rhyme not in rhymes_set and non_rhyme != word:
                    non_rhymes.add(non_rhyme)
            assert len(non_rhymes) == len(rhymes)
            for negative in non_rhymes:
                neg_pairs.add((word, negative, 0))
                chars.update(set(list(negative)))
    return list(pos_pairs.union(neg_pairs)), chars
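
# Hedged sketch of consuming the method's output: the triples are
# (word, candidate, label) and `chars` is the character inventory, so a small
# index map turns each pair into integer sequences for an embedding layer.
# The stand-in data below is illustrative only; in context you would call
# pairs, chars = self._compile_data() instead.
pairs = [("cat", "hat", 1), ("cat", "dog", 0)]               # stand-in data
chars = set("cathdog")                                        # stand-in inventory
char_idx = {c: i + 1 for i, c in enumerate(sorted(chars))}    # 0 reserved for padding

def encode(word):
    return [char_idx[c] for c in word]

examples = [(encode(a), encode(b), label) for a, b, label in pairs]
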
import scrape
import pronouncing
import itertools
import random

subreddit = "writingprompts"
num_comments = 2000
haiku_syllable_limits = [5, 7, 5]

pronouncing.init_cmu()
allowed_words = frozenset(map(lambda x: x[0], pronouncing.pronunciations))


def count_syllables(word):
    if word == "EOF" or word == "SOF":
        return 0
    else:
        phones = pronouncing.phones_for_word(word)
        return pronouncing.syllable_count(phones[0])


print("Setting up...")
# only has next word and counts for each word - not raw probabilities
base_chains = dict()
for chain in scrape.get_raw_chains(subreddit, num_comments, allowed_words):
    chain.insert(0, "SOF")
    chain.append("EOF")
    for index, word in itertools.islice(enumerate(chain), len(chain) - 1):
        next_word = chain[index + 1]
        if word in base_chains:
            if next_word in base_chains[word]:
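                # The excerpt is truncated above; the count update below is an
                # assumed completion, consistent with the "counts for each word"
                # comment on base_chains.
                base_chains[word][next_word] += 1
            else:
                base_chains[word][next_word] = 1
        else:
            base_chains[word] = {next_word: 1}

# Hedged sketch of how the counts and count_syllables() might drive generation
# (illustrative only; the original script's generation code is not shown):
# walk the chain, weighting successors by count, and stop a line before it
# would exceed its syllable limit. Not guaranteed to land exactly on the limit.
def sample_line(limit):
    line, word, syllables = [], "SOF", 0
    while True:
        successors = base_chains.get(word)
        if not successors:
            break
        choices, weights = zip(*successors.items())
        word = random.choices(choices, weights=weights)[0]
        if word == "EOF" or syllables + count_syllables(word) > limit:
            break
        line.append(word)
        syllables += count_syllables(word)
    return " ".join(line)

print(sample_line(haiku_syllable_limits[0]))
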
import sys
import pronouncing
import editdistance

pronouncing.init_cmu()


def phones_for_closest_match(word):
    """Brute force. Look for the lowest distance between all words that are in
    the CMU dictionary.
    """
    by_distance = []
    for possibility, _phones in pronouncing.pronunciations:  # (word, phones) tuples
        # Levenshtein distance
        distance = editdistance.eval(possibility, word)
        # give a bonus for same first letter / last letter
        if possibility.startswith(word[0]):
            distance -= 1
        if possibility.endswith(word[-1]):
            distance -= 1
        # break ties with difference in length
        character_difference = abs(len(possibility) - len(word))
        by_distance.append((distance, character_difference, possibility))
    # find the lowest (final tie breaker is alphabetical, oh well)
    d_edit, d_length, suggestion = min(by_distance)
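    # The excerpt is truncated above; presumably (an assumption, implied by the
    # function name) it goes on to return the pronunciation of the closest match:
    return pronouncing.phones_for_word(suggestion)


# Hedged usage sketch: fall back to the nearest dictionary entry when a word is
# missing from cmudict (the example word is illustrative).
def phones_for_any_word(word):
    return pronouncing.phones_for_word(word) or phones_for_closest_match(word)
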
import pronouncing as p
from random import choice
from flask import Flask

app = Flask(__name__)
p.init_cmu()

swear = 'twat,fart,balls,snatch,pecker,dork,poon,dingle,tit,suck,snot,shit,piss,f**k,c**t,dick,c**k,wad,cum,j**z,crap,pussy,f*g,s**t,douche,ass'.split(',')
two_syllable = []
ing = []
for w in p.pronunciations:                 # w is a (word, phones) tuple
    first_syl = w[1].split(' ')[0]         # first phone of the pronunciation
    gerund = w[1].endswith('IH0 NG')
    pho = [syl for syl in w[1].split(' ') if len(syl) == 3]   # vowel phones carry a stress digit
    if not pho:
        continue
    if (pho[0][-1] == '1' and len(pho) == 2 and w[0][-1] != 's'
            and len(first_syl) == 1 and not gerund):
        two_syllable.append(w)
    if len(pho) == 2 and gerund and len(first_syl) == 1:
        ing.append(w)


def make_word():
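    # The excerpt cuts off at the function header; the body below is an assumed
    # minimal sketch (not the project's actual logic): pair a swear fragment
    # with one of the collected two-syllable words.
    base = choice(swear)
    tail = choice(two_syllable + ing)[0]   # entries are (word, phones) tuples
    return base + tail
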
from sklearn.model_selection import train_test_split
import numpy as np
import pincelate.featurephone as featurephone
import pronouncing as pr
import re
import itertools

# pronouncing loads data on demand, so let's demand it
pr.init_cmu()


def phoneme_feature_vocab():
    feat_vals = set(itertools.chain(*featurephone.phone_feature_map.values()))
    all_vals = list(feat_vals) + ["str"]  # "str" used to mark stressed vowels
    feature_vocab = sorted(all_vals)
    feature_vocab_idx_map = {k: i for i, k in enumerate(feature_vocab)}
    return feature_vocab, feature_vocab_idx_map


def max_phoneme_feature_len():
    # +2 to account for beginning/ending tokens
    return max(len(p.split()) for w, p in pr.pronunciations) + 2


def orthography_vocab():
    letters = set(itertools.chain(*[list(a) for a, b in pr.pronunciations]))
    # ensure vocab item 0 is end of string
    letter_vocab = ["$", "^"] + list(sorted(letters))
    letter_vocab_idx_map = {k: i for i, k in enumerate(letter_vocab)}
    return letter_vocab, letter_vocab_idx_map
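
# Hedged sketch of the helpers in use (the printed values are illustrative,
# not verified output):
feature_vocab, feature_idx = phoneme_feature_vocab()
letter_vocab, letter_idx = orthography_vocab()
print(len(feature_vocab))            # number of phonetic-feature labels, plus "str"
print(letter_vocab[:4])              # begins with the "$" / "^" boundary markers
print(max_phoneme_feature_len())     # longest phone sequence plus two boundary tokens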