Example #1
    def number_to_words(self, number: str) -> List[str]:
        """Return a list of possible word matches for the given number."""
        # pronouncing.pronunciations is a list of (word, phones) tuples
        # pronouncing.lookup is a dict mapping each word to its phonemes

        # 83 should match "FM" and "VM"

        # FIXME: This whole implementation is basically a special form of
        #        pronouncing.search() and hence needs to call .init_cmu(). Ideally this
        #        detail is not needed.
        pronouncing.init_cmu()

        pattern = "^"
        for n in map(int, re.findall(r"\d", number)):
            pattern += f"({'|'.join(self.MAPPING[n])})"
        pattern += "$"

        matcher = re.compile(pattern)

        matches = []
        for word, phonemes in pronouncing.pronunciations:
            consts = "".join(
                filter(lambda p: p in self.phonemes2num.keys(),
                       phonemes.split()))
            if matcher.match(consts):
                matches.append(word)

        return matches
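    # Neither MAPPING nor phonemes2num is shown in this snippet. Judging by
    # the "83 should match FM and VM" comment, they presumably encode the
    # Major mnemonic system's digit-to-consonant table; a minimal sketch of
    # what they might look like (an assumption, not the original definitions):
    MAPPING = {
        0: ("S", "Z"),
        1: ("T", "D"),
        2: ("N",),
        3: ("M",),
        4: ("R",),
        5: ("L",),
        6: ("CH", "JH", "SH", "ZH"),
        7: ("K", "G"),
        8: ("F", "V"),
        9: ("P", "B"),
    }
    phonemes2num = {ph: n for n, phs in MAPPING.items() for ph in phs}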
Example #2
def load():
    pr.init_cmu()  # pronouncing loads data on demand, so let's demand it

    feature_vocab, feature_vocab_idx_map = phoneme_feature_vocab()
    letter_vocab, letter_vocab_idx_map = orthography_vocab()
    max_phone_len = max_phoneme_feature_len()
    max_letter_len = max_orthography_len()

    # TODO: held-out data here

    letter_data = np.zeros(
        (len(pr.pronunciations), max_letter_len, len(letter_vocab)),
        dtype=np.float32)
    phonfeat_data = np.zeros(
        (len(pr.pronunciations), max_phone_len, len(feature_vocab)),
        dtype=np.float32)
    letter_target_data = np.zeros(
        (len(pr.pronunciations), max_letter_len, len(letter_vocab)),
        dtype=np.float32)
    phonfeat_target_data = np.zeros(
        (len(pr.pronunciations), max_phone_len, len(feature_vocab)),
        dtype=np.float32)

    for i, (word, phones) in enumerate(pr.pronunciations):

        # orthography: one-hot for each character index
        word = "^" + word + "$"
        for t, char in enumerate(word):
            letter_data[i, t, letter_vocab_idx_map[char]] = 1.
            if t > 0:
                letter_target_data[i, t - 1, letter_vocab_idx_map[char]] = 1.

        # strip errant "#" comments that trail some CMU entries
        phones = re.sub(' #.*$', '', phones)

        # phonemes: multi-label k-hot phonetic features
        for t, phone in enumerate(["^"] + phones.split() + ["$"]):
            for ft in featurephone.phone_feature_map[phone.strip('012')] + \
                    (('str',) if re.search(r'[12]$', phone) else tuple()):
                phonfeat_data[i, t, feature_vocab_idx_map[ft]] = 1.
                if t > 0:
                    phonfeat_target_data[i, t - 1,
                                         feature_vocab_idx_map[ft]] = 1.

    return {
        'phoneme_feature_vocab': feature_vocab,
        'phoneme_feature_idx_map': feature_vocab_idx_map,
        'orthography_vocab': letter_vocab,
        'orthography_idx_map': letter_vocab_idx_map,
        'phoneme_feature_data': phonfeat_data,
        'phoneme_feature_target_data': phonfeat_target_data,
        'orthography_data': letter_data,
        'orthography_target_data': letter_target_data
    }
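# A sketch of how the returned dictionary might be consumed (shapes only;
# the training code is not part of this snippet):
data = load()
X = data['orthography_data']              # (N, max_letter_len, |letter_vocab|)
Y = data['phoneme_feature_target_data']   # (N, max_phone_len, |feature_vocab|)
assert X.shape[0] == Y.shape[0]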
Example #3
def build_db():
    pronouncing.init_cmu()
    build_word_freq()
    for word, phones in pronouncing.pronunciations:
        if len(word) < 3:
            continue
        if filter_re.search(word):
            continue
        if word not in WORD_FREQ:
            continue
        match = splitter.match(phones)
        if not match:
            continue
        start, end = match.groups()
        rank = WORD_FREQ.get(word, 10000)
        t = (rank, start, end, word)
        BY_END[end].add(t)
        BY_START[start].add(t)
        BY_PHONES[phones].add(t)
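# The module-level state used by build_db() is not shown. Given the .add()
# calls and the WORD_FREQ lookups, a plausible setup (assumed, not the
# original definitions) would be:
import collections
import re

WORD_FREQ = {}                           # word -> frequency rank, filled by build_word_freq()
BY_START = collections.defaultdict(set)
BY_END = collections.defaultdict(set)
BY_PHONES = collections.defaultdict(set)
filter_re = re.compile(r"[^a-z]")        # hypothetical: skip words with non-letter characters
splitter = re.compile(r"^(\S+) (.+)$")   # hypothetical: first phone vs. the remainder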
Example #4
    def _compile_data(self):
        clean_word = lambda word: ''.join(c for c in word if c.isalpha())
        pronouncing.init_cmu()
        chars = set()
        vocab = sorted(map(clean_word, pronouncing.lookup.keys()))
        self.vocab = random.sample(vocab, 20000)
        suffix_index = collections.defaultdict(set)
        for word in pronouncing.lookup.keys():
            for suffix_len in (3, 4, 5):
                suffix_index[word[-min(suffix_len, len(word)):]].add(word)
        pos_pairs, neg_pairs = set(), set()
        for j, word in enumerate(self.vocab):
            assert word
            rhymes_set = set(pronouncing.rhymes(word))
            rhymes = list(rhymes_set)
            if rhymes:
                for rhyme in rhymes:
                    pos_pairs.add((word, rhyme, 1))
                    chars.update(set(list(word) + list(rhyme)))
                non_rhymes, neighbors = set(), set()
                for suffix_len in (3, 4, 5):
                    neighbors = neighbors.union(
                        suffix_index[word[-min(suffix_len, len(word)):]])
                neighbors = list(neighbors)
                random.shuffle(neighbors)
                for neighbor in neighbors:
                    if len(non_rhymes) >= (len(rhymes) / 2):
                        break
                    if neighbor != word and neighbor not in rhymes_set:
                        non_rhymes.add(neighbor)
                while len(non_rhymes) < len(rhymes):
                    non_rhyme = random.choice(vocab)
                    if non_rhyme not in rhymes_set and non_rhyme != word:
                        non_rhymes.add(non_rhyme)
                assert len(non_rhymes) == len(rhymes)
                for negative in non_rhymes:
                    neg_pairs.add((word, negative, 0))
                    chars.update(set(list(negative)))
        return list(pos_pairs.union(neg_pairs)), chars
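# For reference, pronouncing.rhymes() (used above to seed the positive
# pairs) returns every CMU dictionary word whose rhyming part matches:
print(pronouncing.rhymes("boat")[:5])  # e.g. words like "coat", "note", ...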
Example #5
import scrape
import pronouncing
import itertools
import random

subreddit = "writingprompts"
num_comments = 2000
haiku_syllable_limits = [5, 7, 5]

pronouncing.init_cmu()
allowed_words = frozenset(map(lambda x: x[0], pronouncing.pronunciations))


def count_syllables(word):
    if word == "EOF" or word == "SOF":
        return 0
    else:
        phones = pronouncing.phones_for_word(word)
        return pronouncing.syllable_count(phones[0])
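# quick sanity check of the helper above: "cat" is "K AE1 T" in the CMU
# dictionary, i.e. a single vowel phone and hence one syllable
assert count_syllables("cat") == 1
assert count_syllables("SOF") == 0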


print("Setting up...")
# each word maps to next-word counts - not raw probabilities
base_chains = dict()
for chain in scrape.get_raw_chains(subreddit, num_comments, allowed_words):
    chain.insert(0, "SOF")
    chain.append("EOF")
    for index, word in itertools.islice(enumerate(chain), len(chain) - 1):
        next_word = chain[index + 1]
        # assumed completion below this point (the original snippet is
        # truncated here): tally the word -> next_word transition counts
        if word in base_chains:
            if next_word in base_chains[word]:
                base_chains[word][next_word] += 1
            else:
                base_chains[word][next_word] = 1
        else:
            base_chains[word] = {next_word: 1}
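# A sketch of how a haiku line could then be sampled from base_chains (the
# generation step is cut off in this snippet; names below follow the code
# above, but the function itself is assumed):
def sample_line(limit):
    word, line, syllables = "SOF", [], 0
    while True:
        followers = base_chains.get(word)
        if not followers:
            break
        word = random.choices(list(followers),
                              weights=list(followers.values()))[0]
        if word == "EOF" or syllables + count_syllables(word) > limit:
            break
        line.append(word)
        syllables += count_syllables(word)
    return " ".join(line)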
Example #6

import sys

import pronouncing
import editdistance

pronouncing.init_cmu()

def phones_for_closest_match(word):
    """Brute force. Look for lowest distance between all words that are in
    the CMU dictionary.

    """
    by_distance = []
    for possibility, _phones in pronouncing.pronunciations:

        # Levenshtein distance between the spellings
        distance = editdistance.eval(possibility, word)

        # give a bonus for same first letter / last letter
        if possibility.startswith(word[0]):
            distance -= 1
        if possibility.endswith(word[-1]):
            distance -= 1

        # break ties with difference in length
        character_difference = abs(len(possibility) - len(word))
        by_distance.append((distance, character_difference, possibility))

    # find the lowest (final tie breaker is alphabetical, oh well)
    d_edit, d_length, suggestion = min(by_distance)
    # assumed final step (the snippet ends here): return the dictionary
    # pronunciation(s) of the closest match
    return pronouncing.phones_for_word(suggestion)
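# hypothetical usage; the otherwise-unused "import sys" above suggests the
# script reads the word from the command line
if __name__ == "__main__":
    print(phones_for_closest_match(sys.argv[1]))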
Example #7

import pronouncing as p
from random import choice
from flask import Flask

app = Flask(__name__)

p.init_cmu()
swear = 'twat,fart,balls,snatch,pecker,dork,poon,dingle,tit,suck,snot,shit,piss,f**k,c**t,dick,c**k,wad,cum,j**z,crap,pussy,f*g,s**t,douche,ass'.split(
    ',')

two_syllable = []
ing = []

for w in p.pronunciations:
    first_syl = w[1].split(' ')[0]

    # words whose phones end in "IH0 NG" are (roughly) gerunds
    gerund = w[1].endswith('IH0 NG')

    # vowel phones carry a stress digit, so they are exactly 3 characters
    pho = [syl for syl in w[1].split(' ') if len(syl) == 3]
    if not pho:
        continue

    if pho[0][-1] == '1' and len(pho) == 2 and w[0][-1] != 's' and len(
            first_syl) == 1 and not gerund:
        two_syllable.append(w)

    if len(pho) == 2 and gerund and len(first_syl) == 1:
        ing.append(w)


def make_word():
    # assumed completion (the original snippet is truncated here): pair a
    # random stressed two-syllable word with a random "-ing" word
    return '%s %s' % (choice(two_syllable)[0], choice(ing)[0])
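# hypothetical route tying the generator to the Flask app defined above
@app.route("/")
def index():
    return make_word()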
Example #8
from sklearn.model_selection import train_test_split
import numpy as np
import pincelate.featurephone as featurephone
import pronouncing as pr
import re
import itertools

# pronouncing loads data on demand, so let's demand it
pr.init_cmu()


def phoneme_feature_vocab():
    feat_vals = set(itertools.chain(*featurephone.phone_feature_map.values()))
    all_vals = list(feat_vals) + ["str"]  # "str" used to mark stressed vowels
    feature_vocab = sorted(all_vals)
    feature_vocab_idx_map = {k: i for i, k in enumerate(feature_vocab)}
    return feature_vocab, feature_vocab_idx_map


def max_phoneme_feature_len():
    # +2 to account for beginning/ending tokens
    return max(len(p.split()) for w, p in pr.pronunciations) + 2


def orthography_vocab():
    letters = set(itertools.chain(*[list(a) for a, b in pr.pronunciations]))
    # ensure vocab item 0 is end of string
    letter_vocab = ["$", "^"] + list(sorted(letters))
    letter_vocab_idx_map = {k: i for i, k in enumerate(letter_vocab)}
    return letter_vocab, letter_vocab_idx_map