def __init__(self, grammar, length=1):
    """Normalise `grammar` to Chomsky Normal Form and preprocess it.

    `grammar` may be supplied in one of three forms:
      (1) an nltk.grammar.ContextFreeGrammar instance, used as given;
      (2) a string ending in '.cfg', treated as a path and loaded with
          nltk.data.load;
      (3) any other string, which is handed to parse_cfg.

    `length` is the maximum string length to preprocess up front; it is
    forwarded to self._preprocess() once the bookkeeping attributes
    have been initialised.

    Raises ValueError when `length` is smaller than 1, or when
    `grammar` is neither a ContextFreeGrammar nor a str.
    """
    if length < 1:
        raise ValueError('length must be greater than 0.')

    # Resolve the argument into a grammar object; anything that is not
    # already a grammar must be a string (file path or grammar text).
    if isinstance(grammar, ContextFreeGrammar):
        self.grammar = grammar
    elif not isinstance(grammar, str):
        raise ValueError('Arg grammar must be nltk.grammar.Grammar or str.')
    elif grammar.endswith('.cfg'):
        self.grammar = nltk.data.load('file:' + grammar)
    else:
        self.grammar = parse_cfg(grammar)

    # Everything below relies on CNF, so convert when necessary and
    # double-check the result.
    already_cnf = self.grammar.is_chomsky_normal_form()
    if not already_cnf:
        self.grammar = convert_to_cnf(self.grammar)
    assert self.grammar.is_chomsky_normal_form()

    self.productions = self.grammar.productions()

    # TODO: Nonterminals are collected from left-hand sides only.
    # Confirm that nltk's CNF check guarantees every nonterminal
    # appears on some LHS.
    self.nonterminals = set(
        production.lhs() for production in self.productions)

    # A terminal is any right-hand-side symbol that is not a Nonterminal.
    self.terminals = set(
        symbol
        for production in self.productions
        for symbol in production.rhs()
        if not isinstance(symbol, Nonterminal))

    # self._counts starts empty and is filled by _preprocess();
    # self.length records the string length preprocessed so far.
    self._counts = {}
    self.length = 0
    self._preprocess(length)
VOWEL -> MONOPHTHONG | DIPHTHONG MONOPHTHONG -> 'ɔ' | 'ɑ' | 'i' | 'u' | 'ɛ' | 'ɪ' | 'ʊ' | 'ʌ' | 'ə' | 'æ' DIPHTHONG -> 'eɪ' | 'aɪ' | 'oʊ' | 'aʊ' | 'ɔɪ' GLIDE -> 'j' | 'w' SYLLABIC_CONSONANT -> 'm̩' | 'n̩' | 'ɹ̩' | 'ɫ̩' NASAL -> 'm' | 'n' | 'ŋ' LIQUID -> 'ɹ' | 'l' FRICATIVE -> 'f' | 'v' | 'θ' | 'ð' | 's' | 'z' | 'ʃ' | 'ʒ' STOP -> 'p' | 'b' | 't' | 'd' | 'k' | 'g' AFFRICATE -> 'tʃ' | 'dʒ' """) grammar = cnf.convert_to_cnf(non_cnf_grammar) assert grammar.is_chomsky_normal_form() generator = cfl.CFLGenerator(grammar) def main(args): if args: num_phones = int(args[0]) else: num_phones = 50 print ''.join(generator.generate(num_phones)) if __name__ == '__main__': main(sys.argv[1:])