Code example #1 (score: 0)
File: number_encoder.py — Project: w1r4/major-system
    def __init__(self,
                 pronouncer=None,
                 phoneme_to_digit_dict=None,
                 max_word_length=None,
                 context_length=2,
                 min_sentence_length=5):
        '''
        Initializes the ContextEncoder. The encoder will consider the context of at most
        context_length previous words to choose the best subsequent word.

        pronouncer: Pronouncer used for word pronunciations; a fresh Pronouncer
            is created when None.
        phoneme_to_digit_dict: optional mapping from phonemes to digits.
        max_word_length: maximum number of digits a single word may encode;
            None or values outside [1, 19] are clamped to 19.
        context_length: number of previous words used as context; None or
            values outside [0, 5] are clamped to 5.
        min_sentence_length: minimum number of words required per sentence.
        '''
        # Build the default pronouncer here, not in the signature: a call in a
        # default argument runs once at def time and is shared by every caller.
        if pronouncer is None:
            pronouncer = Pronouncer()
        super(ContextEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict)
        # with the default encoding, the maximum number of digits in any word is 19
        max_digits_per_word = 19
        # if we were given an unusual max_word_length, set it to max_digits_per_word
        if max_word_length is None or not 1 <= max_word_length <= max_digits_per_word:
            max_word_length = max_digits_per_word
        self.max_word_length = max_word_length

        max_reasonable_context = 5  # this limit is arbitrary; 5 was chosen with n-grams in mind
        # if we were given an unusual context_length, set it to max_reasonable_context
        if context_length is None or not 0 <= context_length <= max_reasonable_context:
            context_length = max_reasonable_context
        self.context_length = context_length

        # to aid encode_number(), we set up a mapping from phoneme sequences to words
        self.phonemes_to_words_dict = self._get_phonemes_to_words_dict()

        # minimum number of words required per sentence
        self.min_sentence_length = min_sentence_length
Code example #2 (score: 0)
File: number_encoder.py — Project: w1r4/major-system
    def __init__(self,
                 pronouncer=None,
                 phoneme_to_digit_dict=None,
                 max_word_length=None,
                 min_sentence_length=5,
                 n=3,
                 alpha=0.1,
                 select_most_likely=True,
                 tagger=None):
        '''
        Initializes the NgramPOSContextEncoder. The encoder will consider the context of at most
        (n - 1) previous words and choose the subsequent word with highest probability
        part-of-speech and with highest n-gram probability if select_most_likely is True
        (otherwise, will sample weighted by probabilities).

        pronouncer: Pronouncer used for word pronunciations; a fresh Pronouncer
            is created when None.
        tagger: part-of-speech tagger; when None, a UnigramTagger trained on
            the Brown corpus is built here (not in the signature, where it
            would be trained once at import time and shared by all instances).
        '''
        # avoid a call in a default argument: it runs once at def time
        if pronouncer is None:
            pronouncer = Pronouncer()
        super(NgramPOSContextEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length,
                             context_length=n - 1,
                             min_sentence_length=min_sentence_length)
        self.ngram = NgramModel(n, alpha=alpha)
        self.select_most_likely = select_most_likely

        # train the default tagger lazily, only when the caller did not supply one
        if tagger is None:
            tagger = UnigramTagger(brown.tagged_sents())
        self.tagger = tagger
        self.pos_ngram = NgramPOSModel(n, alpha=alpha)
Code example #3 (score: 0)
File: number_encoder.py — Project: w1r4/major-system
 def __init__(self, pronouncer=None, phoneme_to_digit_dict=None):
     '''
     Initializes the NumberEncoder. If given a pronouncer, uses it (a fresh
     Pronouncer is created when None). Also, if given a dictionary mapping
     from phonemes to digits (length-one strings), uses it.
     '''
     # build the default here, not in the signature: a call in a default
     # argument is evaluated once at def time and shared by every instance
     self.pronouncer = Pronouncer() if pronouncer is None else pronouncer
     # truthiness check kept on purpose: an empty dict also falls back to the
     # default mapping, matching the original behavior
     if phoneme_to_digit_dict:
         self.phoneme_to_digit_dict = phoneme_to_digit_dict
     else:
         self.phoneme_to_digit_dict = self._get_phoneme_to_digit_dict()
Code example #4 (score: 0)
File: number_encoder.py — Project: w1r4/major-system
    def __init__(self,
                 pronouncer=None,
                 phoneme_to_digit_dict=None,
                 max_word_length=2):
        '''
        Initializes the UnigramGreedyEncoder. The encoder will greedily group digits to make words
        as long as possible (where the length of a word refers to the number of digits it encodes).

        pronouncer: Pronouncer used for word pronunciations; a fresh Pronouncer
            is created when None.
        phoneme_to_digit_dict: optional mapping from phonemes to digits.
        max_word_length: maximum number of digits a single word may encode.
        '''
        # avoid a call in a default argument: it runs once at def time and the
        # resulting Pronouncer would be shared by every instance
        if pronouncer is None:
            pronouncer = Pronouncer()
        super(UnigramGreedyEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length)

        # unigram model used to rank candidate words by frequency
        self.unigram = NgramModel(1)
Code example #5 (score: 0)
File: number_encoder.py — Project: w1r4/major-system
 def __init__(self,
              pronouncer=None,
              phoneme_to_digit_dict=None,
              max_word_length=2):
     '''
     Initializes the GreedyEncoder. The encoder will greedily group digits to make words as long
     as possible (where the length of a word refers to the number of digits it encodes).

     pronouncer: Pronouncer used for word pronunciations; a fresh Pronouncer
         is created when None.
     phoneme_to_digit_dict: optional mapping from phonemes to digits.
     max_word_length: maximum number of digits a single word may encode;
         None or values outside [1, 19] are clamped to 19.
     '''
     # avoid a call in a default argument: it runs once at def time and the
     # resulting Pronouncer would be shared by every instance
     if pronouncer is None:
         pronouncer = Pronouncer()
     super(GreedyEncoder,
           self).__init__(pronouncer=pronouncer,
                          phoneme_to_digit_dict=phoneme_to_digit_dict)
     # with the default encoding, the maximum number of digits in any word is 19
     max_digits_per_word = 19
     # if we were given an unusual max_word_length, set it to max_digits_per_word
     if max_word_length is None or not 1 <= max_word_length <= max_digits_per_word:
         max_word_length = max_digits_per_word
     self.max_word_length = max_word_length
     # to aid encode_number(), we set up a mapping from phoneme sequences to words
     self.phonemes_to_words_dict = self._get_phonemes_to_words_dict()
Code example #6 (score: 0)
File: number_encoder.py — Project: w1r4/major-system
    def __init__(self,
                 pronouncer=None,
                 phoneme_to_digit_dict=None,
                 max_word_length=2,
                 max_vocab_size=None):
        '''
        Initializes the RandomGreedyEncoder. The encoder will greedily group digits to make words
        as long as possible (where the length of a word refers to the number of digits it encodes).
        The randomly selected word will be from the max_vocab_size most common words in both the
        CMU list and the Brown corpus.

        pronouncer: Pronouncer used for word pronunciations; a fresh Pronouncer
            is created when None.
        max_vocab_size: when not None, restricts the candidate vocabulary to
            the most common max_vocab_size words.
        '''
        # avoid a call in a default argument: it runs once at def time and the
        # resulting Pronouncer would be shared by every instance
        if pronouncer is None:
            pronouncer = Pronouncer()
        super(RandomGreedyEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length)

        # rebuild the phoneme->words mapping from the restricted vocabulary
        if max_vocab_size is not None:
            vocabulary = self._get_vocab(max_vocab_size)
            self.phonemes_to_words_dict = self._get_phonemes_to_words_dict(
                vocabulary)
Code example #7 (score: 0)
File: number_encoder.py — Project: w1r4/major-system
    def __init__(self,
                 pronouncer=None,
                 phoneme_to_digit_dict=None,
                 max_vocab_size=10000,
                 parser=None,
                 evaluator=None):
        '''
        Initializes the ParserEncoder.

        pronouncer: Pronouncer used for word pronunciations; a fresh Pronouncer
            is created when None.
        max_vocab_size: when not None, restricts the candidate vocabulary to
            the most common max_vocab_size words.
        parser: sentence parser; a default Parser() is built when None.
        evaluator: scoring model; a default NgramEvaluator(2) is built when
            None. Defaults are constructed here rather than in the signature,
            where they would be built once at import time and shared.
        '''
        # avoid calls in default arguments: they run once at def time
        if pronouncer is None:
            pronouncer = Pronouncer()
        super(ParserEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict)
        # set up our size-limited vocab
        if max_vocab_size is not None:
            vocabulary = self._get_vocab(max_vocab_size)
            self.phonemes_to_words_dict = self._get_phonemes_to_words_dict(
                vocabulary)
        else:
            self.phonemes_to_words_dict = self._get_phonemes_to_words_dict()

        self.parser = Parser() if parser is None else parser
        self.evaluator = NgramEvaluator(2) if evaluator is None else evaluator
Code example #8 (score: 0)
File: number_encoder.py — Project: w1r4/major-system
    def __init__(self,
                 pronouncer=None,
                 phoneme_to_digit_dict=None,
                 max_word_length=None,
                 min_sentence_length=5,
                 n=3,
                 alpha=0.1,
                 select_most_likely=True,
                 tagger_type=UnigramTagger,
                 tagged_sents=None,
                 num_sentence_templates=100,
                 word_length_weight=10):
        '''
        Initializes the SentenceTaggerEncoder.
        Uses the num_sentence_templates most common part-of-speech sentence types, requiring at
        least min_sentence_length words per sentence. Favors words that encode more digits as a
        function of word_length_weight (0 means unweighted). Scores words and sentences through an
        n-gram model with the given n and alpha.

        pronouncer: Pronouncer used for word pronunciations; a fresh Pronouncer
            is created when None.
        tagged_sents: training sentences for the tagger; when None, the Brown
            corpus with the universal tagset is loaded here (not in the
            signature, where it would be loaded once at import time).
        '''
        # avoid calls in default arguments: they run once at def time and the
        # resulting objects would be shared by every instance
        if pronouncer is None:
            pronouncer = Pronouncer()
        if tagged_sents is None:
            tagged_sents = brown.tagged_sents(tagset='universal')
        super(SentenceTaggerEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length,
                             min_sentence_length=min_sentence_length,
                             n=n,
                             alpha=alpha,
                             select_most_likely=select_most_likely)

        # set up our tagger and sentence_templates
        self.tagger = tagger_type(tagged_sents)
        self.tagged_sents = tagged_sents
        self.num_sentence_templates = num_sentence_templates
        # some parts of speech can be reasonably omitted from any sentence - we call these optional
        self.optional_tags = ['DET', 'ADJ', 'ADV']
        self.sentence_templates = self._get_sentence_templates()
        self.word_length_weight = word_length_weight
        # set up bigram model for post processing
        self.bigram = NgramModel(n=2, alpha=0.05)
Code example #9 (score: 0)
File: major_system.py — Project: w1r4/major-system
def main():
    '''
    Main function for Major System.

    Builds one instance of each encoder type, encodes a handful of test
    numbers with each, and prints the bigram perplexity of every encoding.
    '''
    # NOTE: large blocks of commented-out demo code (Pronouncer,
    # NgramEvaluator, RandomGreedyEncoder, UnigramGreedyEncoder and
    # NgramContextEncoder walkthroughs, plus preliminary benchmarks) were
    # removed as dead code; recover them from version control if needed.

    print('Initializing models..')
    pronouncer = Pronouncer()
    evaluator = NgramEvaluator(2)
    rge = RandomGreedyEncoder(pronouncer=pronouncer,
                              max_word_length=2,
                              max_vocab_size=50000)
    uge = UnigramGreedyEncoder(pronouncer=pronouncer, max_word_length=2)
    nce = NgramContextEncoder(pronouncer=pronouncer,
                              min_sentence_length=5,
                              n=3,
                              alpha=0.1)
    npce = NgramPOSContextEncoder(pronouncer=pronouncer,
                                  min_sentence_length=5,
                                  n=3,
                                  alpha=0.1,
                                  select_most_likely=True)
    pe = ParserEncoder(pronouncer=pronouncer, evaluator=NgramEvaluator(2))
    ste = SentenceTaggerEncoder(pronouncer=pronouncer)
    print('Models initialized.')
    for number in [
            '123456789', '0987654321', '3141592653',
            '31415926535897932384626433832795028841971693993751'
    ]:
        print()
        for encoder in [rge, uge, nce, npce, pe, ste]:
            # the ParserEncoder takes too long to encode long numbers;
            # identity comparison (`is`) is correct here - we are asking
            # "is this that exact object", not testing value equality
            if encoder is pe and len(number) > 10:
                continue
            encoding = encoder.encode_number(number)
            perplexity = evaluator.perplexity(encoding)
            print(
                '{0} encoding for \'{1}\' has perplexity {2:.0f}: {3}'.format(
                    encoder, number, perplexity,
                    encoder.format_encoding(encoding)))