def __init__(self, pronouncer=None, phoneme_to_digit_dict=None, max_word_length=None,
             context_length=2, min_sentence_length=5):
    '''
    Initializes the ContextEncoder. The encoder will consider the context of at most
    context_length previous words to choose the best subsequent word.

    pronouncer: a Pronouncer used to map words to phonemes (a fresh Pronouncer() if None).
    phoneme_to_digit_dict: optional mapping from phonemes to digit strings.
    max_word_length: max digits a single word may encode; clamped to [1, 19].
    context_length: number of previous words considered; clamped to [0, 5].
    min_sentence_length: minimum number of words required per sentence.
    '''
    # NOTE: the default is created per-instance here rather than as a
    # definition-time default argument, which would be evaluated once at import
    # and shared by every encoder.
    if pronouncer is None:
        pronouncer = Pronouncer()
    super(ContextEncoder, self).__init__(pronouncer=pronouncer,
                                         phoneme_to_digit_dict=phoneme_to_digit_dict)
    # with the default encoding, the maximum number of digits in any word is 19
    max_digits_per_word = 19
    # if we were given an unusual max_word_length, set it to max_digits_per_word
    if max_word_length is None or not 1 <= max_word_length <= max_digits_per_word:
        max_word_length = max_digits_per_word
    self.max_word_length = max_word_length
    max_reasonable_context = 5  # this limit is arbitrary; 5 was chosen with n-grams in mind
    # if we were given an unusual context_length, set it to max_reasonable_context
    if context_length is None or not 0 <= context_length <= max_reasonable_context:
        context_length = max_reasonable_context
    self.context_length = context_length
    # to aid encode_number(), we set up a mapping from phoneme sequences to words
    self.phonemes_to_words_dict = self._get_phonemes_to_words_dict()
    # minimum number of words required per sentence
    self.min_sentence_length = min_sentence_length
def __init__(self, pronouncer=None, phoneme_to_digit_dict=None, max_word_length=None,
             min_sentence_length=5, n=3, alpha=0.1, select_most_likely=True, tagger=None):
    '''
    Initializes the NgramPOSContextEncoder. The encoder will consider the context of at
    most (n - 1) previous words and choose the subsequent word with highest probability
    part-of-speech and with highest n-gram probability if select_most_likely is True
    (otherwise, will sample weighted by probabilities).

    pronouncer: a Pronouncer (a fresh Pronouncer() if None).
    tagger: a POS tagger; if None, a UnigramTagger trained on the Brown corpus is built.
    '''
    # build defaults lazily: a definition-time UnigramTagger(brown.tagged_sents())
    # default would train the tagger at import time, even if never used
    if pronouncer is None:
        pronouncer = Pronouncer()
    if tagger is None:
        tagger = UnigramTagger(brown.tagged_sents())
    super(NgramPOSContextEncoder, self).__init__(pronouncer=pronouncer,
                                                 phoneme_to_digit_dict=phoneme_to_digit_dict,
                                                 max_word_length=max_word_length,
                                                 context_length=n - 1,
                                                 min_sentence_length=min_sentence_length)
    self.ngram = NgramModel(n, alpha=alpha)
    self.select_most_likely = select_most_likely
    self.tagger = tagger
    self.pos_ngram = NgramPOSModel(n, alpha=alpha)
def __init__(self, pronouncer=None, phoneme_to_digit_dict=None):
    '''
    Initializes the NumberEncoder. If given a pronouncer, uses it. Also, if given a
    dictionary mapping from phonemes to digits (length-one strings), uses it.

    pronouncer: a Pronouncer (a fresh Pronouncer() if None).
    phoneme_to_digit_dict: optional mapping from phonemes to digit strings; if falsy,
        the default mapping from _get_phoneme_to_digit_dict() is used.
    '''
    # avoid a definition-time Pronouncer() default (evaluated once and shared)
    if pronouncer is None:
        pronouncer = Pronouncer()
    self.pronouncer = pronouncer
    # NOTE(review): a truthiness test means an explicitly-passed empty dict is
    # replaced by the default; preserved as the original behavior
    if phoneme_to_digit_dict:
        self.phoneme_to_digit_dict = phoneme_to_digit_dict
    else:
        self.phoneme_to_digit_dict = self._get_phoneme_to_digit_dict()
def __init__(self, pronouncer=None, phoneme_to_digit_dict=None, max_word_length=2):
    '''
    Initializes the UnigramGreedyEncoder. The encoder will greedily group digits to make
    words as long as possible (where the length of a word refers to the number of digits
    it encodes).

    pronouncer: a Pronouncer (a fresh Pronouncer() if None).
    max_word_length: max digits a single word may encode (validated by GreedyEncoder).
    '''
    # avoid a definition-time Pronouncer() default (evaluated once and shared)
    if pronouncer is None:
        pronouncer = Pronouncer()
    super(UnigramGreedyEncoder, self).__init__(pronouncer=pronouncer,
                                               phoneme_to_digit_dict=phoneme_to_digit_dict,
                                               max_word_length=max_word_length)
    # unigram model used to rank candidate words by frequency
    self.unigram = NgramModel(1)
def __init__(self, pronouncer=None, phoneme_to_digit_dict=None, max_word_length=2):
    '''
    Initializes the GreedyEncoder. The encoder will greedily group digits to make words
    as long as possible (where the length of a word refers to the number of digits it
    encodes).

    pronouncer: a Pronouncer (a fresh Pronouncer() if None).
    max_word_length: max digits a single word may encode; clamped to [1, 19].
    '''
    # avoid a definition-time Pronouncer() default (evaluated once and shared)
    if pronouncer is None:
        pronouncer = Pronouncer()
    super(GreedyEncoder, self).__init__(pronouncer=pronouncer,
                                        phoneme_to_digit_dict=phoneme_to_digit_dict)
    # with the default encoding, the maximum number of digits in any word is 19
    max_digits_per_word = 19
    # if we were given an unusual max_word_length, set it to max_digits_per_word
    if max_word_length is None or not 1 <= max_word_length <= max_digits_per_word:
        max_word_length = max_digits_per_word
    self.max_word_length = max_word_length
    # to aid encode_number(), we set up a mapping from phoneme sequences to words
    self.phonemes_to_words_dict = self._get_phonemes_to_words_dict()
def __init__(self, pronouncer=None, phoneme_to_digit_dict=None, max_word_length=2,
             max_vocab_size=None):
    '''
    Initializes the RandomGreedyEncoder. The encoder will greedily group digits to make
    words as long as possible (where the length of a word refers to the number of digits
    it encodes). The randomly selected word will be from the max_vocab_size most common
    words in both the CMU list and the Brown corpus.

    pronouncer: a Pronouncer (a fresh Pronouncer() if None).
    max_vocab_size: if not None, restrict the vocabulary to this many common words.
    '''
    # avoid a definition-time Pronouncer() default (evaluated once and shared)
    if pronouncer is None:
        pronouncer = Pronouncer()
    super(RandomGreedyEncoder, self).__init__(pronouncer=pronouncer,
                                              phoneme_to_digit_dict=phoneme_to_digit_dict,
                                              max_word_length=max_word_length)
    # rebuild the phoneme->word mapping from a size-limited vocabulary if requested;
    # otherwise keep the full mapping built by GreedyEncoder
    if max_vocab_size is not None:
        vocabulary = self._get_vocab(max_vocab_size)
        self.phonemes_to_words_dict = self._get_phonemes_to_words_dict(vocabulary)
def __init__(self, pronouncer=None, phoneme_to_digit_dict=None, max_vocab_size=10000,
             parser=None, evaluator=None):
    '''
    Initializes the ParserEncoder.

    pronouncer: a Pronouncer (a fresh Pronouncer() if None).
    max_vocab_size: if not None, restrict the vocabulary to this many common words.
    parser: a Parser (a fresh Parser() if None).
    evaluator: a scorer for candidate sentences (NgramEvaluator(2) if None).
    '''
    # build defaults lazily instead of as definition-time default arguments,
    # which would be constructed once at import and shared by all instances
    if pronouncer is None:
        pronouncer = Pronouncer()
    if parser is None:
        parser = Parser()
    if evaluator is None:
        evaluator = NgramEvaluator(2)
    super(ParserEncoder, self).__init__(pronouncer=pronouncer,
                                        phoneme_to_digit_dict=phoneme_to_digit_dict)
    # set up our size-limited vocab
    if max_vocab_size is not None:
        vocabulary = self._get_vocab(max_vocab_size)
        self.phonemes_to_words_dict = self._get_phonemes_to_words_dict(vocabulary)
    else:
        self.phonemes_to_words_dict = self._get_phonemes_to_words_dict()
    self.parser = parser
    self.evaluator = evaluator
def __init__(self, pronouncer=None, phoneme_to_digit_dict=None, max_word_length=None,
             min_sentence_length=5, n=3, alpha=0.1, select_most_likely=True,
             tagger_type=UnigramTagger, tagged_sents=None,
             num_sentence_templates=100, word_length_weight=10):
    '''
    Initializes the SentenceTaggerEncoder. Uses the num_sentence_templates most common
    part-of-speech sentence types, requiring at least min_sentence_length words per
    sentence. Favors words that encode more digits as a function of word_length_weight
    (0 means unweighted). Scores words and sentences through an n-gram model with the
    given n and alpha.

    pronouncer: a Pronouncer (a fresh Pronouncer() if None).
    tagger_type: class used to build the POS tagger from tagged_sents.
    tagged_sents: training sentences; Brown corpus (universal tagset) if None.
    '''
    # build defaults lazily: a definition-time brown.tagged_sents(...) default
    # would load the corpus view at import time, even if never used
    if pronouncer is None:
        pronouncer = Pronouncer()
    if tagged_sents is None:
        tagged_sents = brown.tagged_sents(tagset='universal')
    super(SentenceTaggerEncoder, self).__init__(pronouncer=pronouncer,
                                                phoneme_to_digit_dict=phoneme_to_digit_dict,
                                                max_word_length=max_word_length,
                                                min_sentence_length=min_sentence_length,
                                                n=n, alpha=alpha,
                                                select_most_likely=select_most_likely)
    # set up our tagger and sentence_templates
    self.tagger = tagger_type(tagged_sents)
    self.tagged_sents = tagged_sents
    self.num_sentence_templates = num_sentence_templates
    # some parts of speech can be reasonably omitted from any sentence - we call these optional
    self.optional_tags = ['DET', 'ADJ', 'ADV']
    self.sentence_templates = self._get_sentence_templates()
    self.word_length_weight = word_length_weight
    # set up bigram model for post processing
    self.bigram = NgramModel(n=2, alpha=0.05)
def main():
    '''
    Main function for Major System: builds each encoder once, then encodes a set of
    test numbers with every encoder and reports the bigram perplexity of each encoding.
    (Earlier per-class demo code that lived here as commented-out blocks has been
    removed as dead code; see version control history if it is needed again.)
    '''
    print('Initializing models..')
    pronouncer = Pronouncer()
    evaluator = NgramEvaluator(2)
    rge = RandomGreedyEncoder(pronouncer=pronouncer, max_word_length=2,
                              max_vocab_size=50000)
    uge = UnigramGreedyEncoder(pronouncer=pronouncer, max_word_length=2)
    nce = NgramContextEncoder(pronouncer=pronouncer, min_sentence_length=5, n=3,
                              alpha=0.1)
    npce = NgramPOSContextEncoder(pronouncer=pronouncer, min_sentence_length=5, n=3,
                                  alpha=0.1, select_most_likely=True)
    pe = ParserEncoder(pronouncer=pronouncer, evaluator=NgramEvaluator(2))
    ste = SentenceTaggerEncoder(pronouncer=pronouncer)
    print('Models initialized.')
    numbers = [
        '123456789',
        '0987654321',
        '3141592653',
        '31415926535897932384626433832795028841971693993751',
    ]
    for number in numbers:
        print()
        for encoder in [rge, uge, nce, npce, pe, ste]:
            # the ParserEncoder takes too long to encode long numbers
            # (identity check: we are skipping that specific instance)
            if encoder is pe and len(number) > 10:
                continue
            encoding = encoder.encode_number(number)
            perplexity = evaluator.perplexity(encoding)
            print('{0} encoding for \'{1}\' has perplexity {2:.0f}: {3}'.format(
                encoder, number, perplexity, encoder.format_encoding(encoding)))