-
Notifications
You must be signed in to change notification settings - Fork 0
/
min_pairs.py
116 lines (95 loc) · 4 KB
/
min_pairs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
from collections import defaultdict
from nltk.corpus import cmudict
# Convenience methods
def get_rhymes(word):
"""Get all rhymes of a word.
Returns a list of (pronunciation, rhymes) tuples.
"""
return list(MinPairFinder.get_instance().word_to_rhymes(word))
class Singleton(object):
_instance = None
@classmethod
def get_instance(cls):
if not cls._instance:
cls._instance = cls()
return cls._instance
class MinPairFinder(Singleton):
_dict = None
_rhymes_dict = None
def __init__(self):
"""Instantiate a MinPairFinder.
Note; To avoid unnecessarily repeating the work of loading the dict,
call `get_instance` instead.
"""
if not self._dict:
self._dict = cmudict.dict()
if not self._rhymes_dict:
self._rhymes_dict = self._get_rhymes_dict()
def _get_rhymes_dict(self):
"""Get the cmudict as a rhymes dictionary.
This maps similar phonemes to each other, skipping the first phoneme.
You end up with a dict like
{('AO1', 'G', 'Z'): ['dogs', 'logs', ...], ...}
"""
rhymes = defaultdict(list)
for (word, pronunciations) in self._dict.iteritems():
for pronunciation in pronunciations:
rhymes[tuple(pronunciation[1:])].append(word)
return rhymes
def word_to_rhymes(self, word):
"""Get all rhymes of a word, for all pronunciations.
Ards:
word: The word you want rhymes for.
Yields (pronunciation, [rhymes]) tuples
"""
for pronunciation in self._dict[word]:
key = tuple(pronunciation[1:])
yield (pronunciation, self._rhymes_dict[key])
def min_pairs(self, word, min_len=2, max_len=5):
"""Find all minimum pairs of the given word.
Args:
word: The word you want minimal pairs for.
min_len: The minimum length of the min-pair words. Default 1.
max_len: The maximum length of the min-pair rhyming words.
Default 100.
"""
pairs = []
for pronunciation, rhymes in self.word_to_rhymes(word):
pairs.extend(rhyme for rhyme in rhymes
if min_len <= len(rhyme) <= max_len)
return pairs
def words_starting_with(self, letter):
"""Find all words starting with the given letter or letters."""
for word in self._dict.iterkeys():
if word.startswith(letter):
yield word
def get_prefix_variants(self, prefixes, min_len=1, max_len=100):
"""Find all rhymes for all words starting with the given prefixes.
Args:
prefixes: An iterable of prefixes. Commonly a string made up of
first-letters, but a list of prefixes works as well.
min_len: The minimum length of the rhyming words. Default 1.
max_len: The maximum length of the rhyming words. Default 100.
"""
pairs = []
needs_prefixes = set(prefixes)
seen_words = set()
for word in self.words_starting_with(prefixes[0]):
for pronunciation in self._dict[word]:
has_prefixes = set()
rhymes = filter(lambda rhyme: min_len <= len(rhyme) <= max_len,
self._rhymes_dict[tuple(pronunciation[1:])])
if len(rhymes) < len(prefixes):
break
# Filter out words that don't start with the desired prefixes
filtered_rhymes = []
for word in rhymes:
for prefix in prefixes:
if word.startswith(prefix) and word not in seen_words:
has_prefixes.add(prefix)
filtered_rhymes.append(word)
seen_words.add(word)
break
if has_prefixes == needs_prefixes:
pairs.append(sorted(filtered_rhymes))
return sorted(pairs)