Example #1
    def __init__(self,
                 pronouncer=Pronouncer(),
                 phoneme_to_digit_dict=None,
                 max_word_length=None,
                 min_sentence_length=5,
                 n=3,
                 alpha=0.1,
                 select_most_likely=True,
                 tagger=UnigramTagger(brown.tagged_sents())):
        '''
        Initializes the NgramPOSContextEncoder. The encoder considers the context of at most
        (n - 1) previous words, picks the most probable part-of-speech for the next word, and then
        chooses the word with that part-of-speech that has the highest n-gram probability if
        select_most_likely is True (otherwise, samples both the tag and the word weighted by
        their probabilities).
        '''
        super(NgramPOSContextEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length,
                             context_length=n - 1,
                             min_sentence_length=min_sentence_length)
        self.ngram = NgramModel(n, alpha=alpha)
        self.select_most_likely = select_most_likely

        self.tagger = tagger
        self.pos_ngram = NgramPOSModel(n, alpha=alpha)
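One caveat about the signature above (an observation, not part of the original code): defaults such as Pronouncer() and UnigramTagger(brown.tagged_sents()) are evaluated once, at function-definition time, so the Brown corpus is tagged on import and every encoder shares that single tagger instance. A minimal sketch of the usual None-sentinel workaround, using nltk directly and an illustrative class name:

from nltk.corpus import brown
from nltk.tag import UnigramTagger


class LazyTaggerExample(object):
    '''Illustrative only: builds the heavy default lazily instead of at definition time.'''
    def __init__(self, tagger=None):
        if tagger is None:
            # constructed on first use rather than when the module is imported
            tagger = UnigramTagger(brown.tagged_sents())
        self.tagger = tagger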
Example #2
    def __init__(self, n, alpha=None, ngram_model=None):
        self.n = n
        if ngram_model is not None:
            self.language_model = ngram_model
        elif alpha is not None:
            self.language_model = NgramModel(n, alpha=alpha)
        else:
            self.language_model = NgramModel(n)
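The three branches above pick the underlying language model; a quick construction sketch, assuming NgramModel and NgramEvaluator are importable (the module path below is a guess):

# hypothetical module path - adjust to wherever these classes actually live
from ngram import NgramModel, NgramEvaluator

default_eval = NgramEvaluator(3)                            # falls back to NgramModel(3)
smoothed_eval = NgramEvaluator(3, alpha=0.1)                # NgramModel(3, alpha=0.1)
shared_model = NgramModel(2, alpha=0.05)
reusing_eval = NgramEvaluator(2, ngram_model=shared_model)  # reuses an existing model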
Example #3
class NgramContextEncoder(ContextEncoder):
    '''
    NgramContextEncoder is a ContextEncoder that selects the most common encoding according to an
    n-gram model.
    '''
    def __init__(self,
                 pronouncer=Pronouncer(),
                 phoneme_to_digit_dict=None,
                 max_word_length=None,
                 min_sentence_length=5,
                 n=3,
                 alpha=0.1,
                 select_most_likely=True):
        '''
        Initializes the NgramContextEncoder. The encoder will consider the context of at most
        (n - 1) previous words and choose the subsequent word with highest n-gram probability
        if select_most_likely is True (otherwise, will sample weighted by probabilities).
        '''
        super(NgramContextEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length,
                             context_length=n - 1,
                             min_sentence_length=min_sentence_length)
        self.ngram = NgramModel(n, alpha=alpha)
        self.select_most_likely = select_most_likely

    def _select_encoding(self, previous_words, encodings):
        '''
        Selects the most common encoding according to n-gram probabilities.
        '''
        if len(encodings) == 0:
            return None
        if self.select_most_likely:
            max_prob = -float('inf')
            max_prob_encoding = None
            for encoding in encodings:
                prob = self.ngram.prob(previous_words, encoding)
                if prob > max_prob:
                    max_prob = prob
                    max_prob_encoding = encoding
            return max_prob_encoding
        else:
            probabilities = [
                exp(self.ngram.prob(previous_words, encoding))
                for encoding in encodings
            ]
            probability_sum = sum(probabilities)
            probabilities_norm = [
                probability / probability_sum for probability in probabilities
            ]
            return choice(encodings, p=probabilities_norm)
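A rough usage sketch for the encoder above. The import path, previous-word context, and candidate list are made up; note that calling choice with a p= keyword implies numpy.random.choice rather than random.choice:

# hypothetical module path
from ngram_context_encoder import NgramContextEncoder

encoder = NgramContextEncoder(n=3, alpha=0.1, select_most_likely=True)
# given the previous (n - 1) words, pick the candidate with the highest n-gram probability
best = encoder._select_encoding(('the', 'old'), ['man', 'moon', 'mine'])

# with select_most_likely=False the same call samples a candidate, weighted by the
# exponentiated (log) probabilities normalized to sum to 1
sampler = NgramContextEncoder(n=3, alpha=0.1, select_most_likely=False)
sampled = sampler._select_encoding(('the', 'old'), ['man', 'moon', 'mine'])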
Example #4
class NgramEvaluator(object):
    '''
    Evaluates the likelihood of a given list of words appearing in text based on an N-gram
    language model.
    '''
    def __init__(self, n, alpha=None, ngram_model=None):
        self.n = n
        if ngram_model is not None:
            self.language_model = ngram_model
        elif alpha is not None:
            self.language_model = NgramModel(n, alpha=alpha)
        else:
            self.language_model = NgramModel(n)

    def score(self, phrase):
        '''
        Returns a score for the given phrase (a list of strings) based on the log likelihood of 
        seeing this phrase from the language model.
        '''
        # score the first n - 1 words in the phrase (those without a full n-gram context);
        # the remaining words are covered by the full n-grams below
        score = 0
        for m in range(min(self.n - 1, len(phrase))):
            context = tuple(phrase[0:m])
            word = phrase[m]
            score += self.language_model.prob(context, word)

        # if the input phrase has fewer than n words, simply score the phrase without breaking
        # into n-grams
        if len(phrase) < self.n:
            return score

        # otherwise, sum the log probabilities of each n-gram
        phrase = [word.lower() for word in phrase]
        grams = ngrams(phrase, self.n)
        for gram in grams:
            context = gram[0:self.n - 1]
            word = gram[-1]
            score += self.language_model.prob(context, word)
        return score

    def perplexity(self, phrase):
        '''
        Returns the perplexity of the given phrase (a list of strings).
        '''
        log_prob = self.score(phrase)
        prob = pow(e, log_prob)
        # if the log_prob is low enough, prob will be 0.0, which causes a ZeroDivisionError
        if prob == 0:
            return float('inf')
        perplexity = pow(prob, -1 / len(phrase))
        return perplexity
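A short usage sketch for the evaluator above. The exp/pow(e, ...) calls imply that NgramModel.prob() returns log probabilities, so score() is a log likelihood; the import path is a guess:

# hypothetical module path
from ngram import NgramEvaluator

evaluator = NgramEvaluator(2, alpha=0.1)
phrase = ['the', 'cat', 'sat', 'on', 'the', 'mat']
log_likelihood = evaluator.score(phrase)   # sum of log P(word | context)
ppl = evaluator.perplexity(phrase)         # P(phrase) ** (-1 / len(phrase)), inf on underflow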
Example #5
    def __init__(self,
                 pronouncer=Pronouncer(),
                 phoneme_to_digit_dict=None,
                 max_word_length=2):
        '''
        Initializes the UnigramGreedyEncoder. The encoder will greedily group digits to make words
        as long as possible (where the length of a word refers to the number of digits it encodes).
        '''
        super(UnigramGreedyEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length)

        self.unigram = NgramModel(1)
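The word length mentioned in the docstring counts encoded digits, not letters. A tiny illustration of that idea, assuming a standard Major-system style phoneme-to-digit mapping (the project's real mapping lives in phoneme_to_digit_dict and may differ):

# illustrative subset of a Major-system style mapping; vowels encode nothing
MAJOR_DIGITS = {'N': '2', 'M': '3', 'T': '1', 'R': '4', 'L': '5'}


def encoded_digits(phonemes):
    '''Concatenates the digits of the phonemes that map to one (vowels are skipped).'''
    return ''.join(MAJOR_DIGITS.get(p, '') for p in phonemes)


encoded_digits(['M', 'UW1', 'N'])              # 'moon'   -> '32'  (word length 2)
encoded_digits(['M', 'IH1', 'N', 'AH0', 'T'])  # 'minute' -> '321' (word length 3)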
Example #6
class UnigramGreedyEncoder(GreedyEncoder):
    '''
    UnigramGreedyEncoder is a GreedyEncoder that selects the most common encoding according to a
    unigram model.
    '''
    def __init__(self,
                 pronouncer=Pronouncer(),
                 phoneme_to_digit_dict=None,
                 max_word_length=2):
        '''
        Initializes the UnigramGreedyEncoder. The encoder will greedily group digits to make words
        as long as possible (where the length of a word refers to the number of digits it encodes).
        '''
        super(UnigramGreedyEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length)

        self.unigram = NgramModel(1)

    def _select_encoding(self, encodings):
        '''
        Selects the most common encoding according to unigram probabilities.
        '''
        max_prob = -float('inf')
        max_prob_encoding = None
        for encoding in encodings:
            prob = self.unigram.prob((), encoding)
            if prob > max_prob:
                max_prob = prob
                max_prob_encoding = encoding
        return max_prob_encoding
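A small sketch of the selection above, assuming the standard Major-system mapping so that all three candidates encode the same digits; the import path is a guess, and the winner depends on the unigram counts behind NgramModel(1):

# hypothetical module path
from greedy_encoder import UnigramGreedyEncoder

encoder = UnigramGreedyEncoder(max_word_length=2)
# 'man', 'moon', and 'mine' all encode '32'; the most frequent unigram is returned
best = encoder._select_encoding(['man', 'moon', 'mine'])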
Example #7
    def __init__(self,
                 pronouncer=Pronouncer(),
                 phoneme_to_digit_dict=None,
                 max_word_length=None,
                 min_sentence_length=5,
                 n=3,
                 alpha=0.1,
                 select_most_likely=True,
                 tagger_type=UnigramTagger,
                 tagged_sents=brown.tagged_sents(tagset='universal'),
                 num_sentence_templates=100,
                 word_length_weight=10):
        '''
        Initializes the SentenceTaggerEncoder.
        Uses the num_sentence_templates most common part-of-speech sentence types, requiring at
        least min_sentence_length words per sentence. Favors words that encode more digits as a
        function of word_length_weight (0 means unweighted). Scores words and sentences through an
        n-gram model with the given n and alpha.
        '''
        super(SentenceTaggerEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length,
                             min_sentence_length=min_sentence_length,
                             n=n,
                             alpha=alpha,
                             select_most_likely=select_most_likely)

        # set up our tagger and sentence_templates
        self.tagger = tagger_type(tagged_sents)
        self.tagged_sents = tagged_sents
        self.num_sentence_templates = num_sentence_templates
        # some parts of speech can be reasonably omitted from any sentence - we call these optional
        self.optional_tags = ['DET', 'ADJ', 'ADV']
        self.sentence_templates = self._get_sentence_templates()
        self.word_length_weight = word_length_weight
        # set up bigram model for post processing
        self.bigram = NgramModel(n=2, alpha=0.05)
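The word_length_weight described above enters the selection step (see _select_encoding in the next example) as a multiplicative factor of len(digits) ** word_length_weight. A tiny worked sketch with made-up log probabilities:

from math import exp

word_length_weight = 10
# (candidate word, digits it encodes, log n-gram probability) - illustrative numbers only
candidates = [('rome', '43', -6.0), ('remain', '432', -9.0)]
weighted = {
    word: exp(log_prob) * len(digits) ** word_length_weight
    for word, digits, log_prob in candidates
}
# the longer encoding wins despite its lower raw probability:
# exp(-9) * 3 ** 10 is about 7.3, versus exp(-6) * 2 ** 10 at about 2.5
best = max(weighted, key=weighted.get)  # 'remain'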
Example #8
class SentenceTaggerEncoder(NgramContextEncoder):
    '''
    SentenceTaggerEncoder is an NgramContextEncoder that creates sentences whose part-of-speech
    tags match a sentence in the training corpus, selecting from valid words by n-gram probability.
    '''
    def __init__(self,
                 pronouncer=Pronouncer(),
                 phoneme_to_digit_dict=None,
                 max_word_length=None,
                 min_sentence_length=5,
                 n=3,
                 alpha=0.1,
                 select_most_likely=True,
                 tagger_type=UnigramTagger,
                 tagged_sents=brown.tagged_sents(tagset='universal'),
                 num_sentence_templates=100,
                 word_length_weight=10):
        '''
        Initializes the SentenceTaggerEncoder.
        Uses the num_sentence_templates most common part-of-speech sentence types, requiring at
        least min_sentence_length words per sentence. Favors words that encode more digits as a
        function of word_length_weight (0 means unweighted). Scores words and sentences through an
        n-gram model with the given n and alpha.
        '''
        super(SentenceTaggerEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length,
                             min_sentence_length=min_sentence_length,
                             n=n,
                             alpha=alpha,
                             select_most_likely=select_most_likely)

        # set up our tagger and sentence_templates
        self.tagger = tagger_type(tagged_sents)
        self.tagged_sents = tagged_sents
        self.num_sentence_templates = num_sentence_templates
        # some parts of speech can be reasonably omitted from any sentence - we call these optional
        self.optional_tags = ['DET', 'ADJ', 'ADV']
        self.sentence_templates = self._get_sentence_templates()
        self.word_length_weight = word_length_weight
        # set up bigram model for post processing
        self.bigram = NgramModel(n=2, alpha=0.05)

    def _get_sentence_templates(self):
        '''
        Returns a list of tuples containing a sentence template (POS tag tuple) and count.
        For example, an element of the returned list is of the form (('ADJ', 'NOUN'), 85).
        '''
        # extract the POS tags (not the actual words) from self.tagged_sents, skipping punctuation
        # tags
        sent_tags = [
            tuple([word_tag[1] for word_tag in tag_sent if word_tag[1] != '.'])
            for tag_sent in self.tagged_sents
        ]

        # filter out any sentence with an unknown word or a number
        def bad_tag(tag):
            return (tag == 'X' or tag == 'NUM')

        sent_tags_filtered = list(
            filter(lambda tag_sent: not any(bad_tag(tag) for tag in tag_sent),
                   sent_tags))
        # filter out any sentence that has no 'VERB'
        sent_tags_filtered = [
            tag_sent for tag_sent in sent_tags_filtered if 'VERB' in tag_sent
        ]

        # filter out any sentence that does not have enough required parts of speech (is too short)
        def long_enough(tag_sent):
            num_optional_tags = sum(
                [tag_sent.count(tag) for tag in self.optional_tags])
            num_required_tags = len(tag_sent) - num_optional_tags
            return num_required_tags >= self.min_sentence_length

        sent_tags_filtered = [
            tag_sent for tag_sent in sent_tags_filtered
            if long_enough(tag_sent)
        ]
        # select only the self.num_sentence_templates most common templates
        templates = Counter(sent_tags_filtered).most_common(
            self.num_sentence_templates)
        return templates

    def _get_sentence_template(self, templates):
        '''
        Randomly selects a sentence template (a tuple of POS tags) from the given templates.
        Returns a tuple containing the template and its index.
        '''
        probs = []
        total_prob = 0
        for template in templates:
            template_prob = template[1]
            probs += [template_prob]
            total_prob += template_prob
        probs_norm = [prob / total_prob for prob in probs]
        template_sentences = [t[0] for t in templates]
        # choice only works with multiple items, so we manually check if there is only one template
        if len(template_sentences) == 1:
            return (template_sentences[0], 0)
        sentence_template_index = choice(len(template_sentences), p=probs_norm)
        return (template_sentences[sentence_template_index],
                sentence_template_index)

    def _select_encoding(self, previous_words, encodings):
        '''
        Selects the most common encoding according to n-gram probabilities, weighted toward the
        encoding that encodes the most digits by self.word_length_weight (0 is unweighted).
        '''
        if len(encodings) == 0:
            return None
        elif len(encodings) == 1:
            return encodings[0]
        word_length_weight = self.word_length_weight
        if self.select_most_likely:
            max_prob = -float('inf')
            max_prob_encoding = None
            for encoding in encodings:
                prob = exp(self.ngram.prob(previous_words, encoding))
                weighted_prob = prob * pow(len(self.decode_word(encoding)),
                                           word_length_weight)
                if weighted_prob > max_prob:
                    max_prob = weighted_prob
                    max_prob_encoding = encoding
            return max_prob_encoding
        else:
            probabilities = [
                exp(self.ngram.prob(previous_words, encoding)) *
                pow(len(self.decode_word(encoding)), word_length_weight)
                for encoding in encodings
            ]
            probability_sum = sum(probabilities)
            probabilities_norm = [
                probability / probability_sum for probability in probabilities
            ]
            return choice(encodings, p=probabilities_norm)

    def encode_number(self,
                      number,
                      max_word_length=None,
                      context_length=None,
                      num_times=1,
                      evaluator=None):
        '''
        Generates num_times encodings and returns the encoding with the lowest perplexity according
        to the evaluator (assumes the evaluator has a perplexity() function).
        '''
        if evaluator is None:
            evaluator = NgramEvaluator(self.ngram.n, ngram_model=self.ngram)
        # generate num_times encodings
        encodings = []
        for i in range(num_times):
            encoding = self.encode_number_once(number, max_word_length,
                                               context_length)
            encodings += [encoding]
        # if there is only one encoding, don't bother evaluating it
        if len(encodings) == 1:
            return encodings[0]
        # determine encoding with the lowest perplexity - note that all perplexities might be
        # infinite if the encoding is long
        min_perplexity = float('inf')
        min_perplexity_encoding = None
        for encoding in encodings:
            perplexity = evaluator.perplexity(encoding)
            if perplexity <= min_perplexity:
                min_perplexity = perplexity
                min_perplexity_encoding = encoding
        return min_perplexity_encoding

    def encode_number_once(self,
                           number,
                           max_word_length=None,
                           context_length=None):
        '''
        Encodes the given number (string of digits) as a series of words. This series of words
        will be a series of sentences (separated by periods). Considers all possible digit chunks
        up to max_word_length and, based on the previous context_length words, selects a word that
        matches the needed part-of-speech tag via self._select_encoding().
        '''
        # if not given max_word_length, use class default
        if max_word_length is None:
            max_word_length = self.max_word_length

        # if not given context_length, use class default
        if context_length is None:
            context_length = self.context_length

        encoded_index = 0  # the last index of number we've encoded, inclusive
        encodings = []
        sentence_template = None
        templates = deepcopy(self.sentence_templates)
        while encoded_index < len(number):
            if (sentence_template is None) or (sentence_index
                                               == len(sentence_template) - 1):
                # if we successfully matched a sentence, we can sample from all templates
                if sentence_template is not None:
                    templates = deepcopy(self.sentence_templates)
                if len(templates) > 0:
                    sentence_template, sentence_template_index = self._get_sentence_template(
                        templates)
                    del templates[
                        sentence_template_index]  # since we've used this template, remove it
                else:
                    # if we run out of sentence templates, we will accept any single word
                    sentence_template = ['*']
                sentence_index = 0
                if len(encodings) != 0:
                    encodings += ['.']
            else:
                sentence_index += 1
            # for all possible chunks starting at this position, find all possible encodings
            chunk_encodings = set()
            for chunk_length in range(1, max_word_length + 1):
                number_chunk = number[encoded_index:encoded_index +
                                      chunk_length]
                chunk_encodings |= set(self._encode_number_chunk(number_chunk))
            # filter out the chunk_encodings that do not match the needed part-of-speech tag
            pos_tag = sentence_template[sentence_index]
            pos_tags = [pos_tag]
            # if the pos_tag is a pronoun, we allow a noun instead
            if pos_tag == 'PRON':
                pos_tags += ['NOUN']
            if pos_tag == '*':
                # if the pos_tag is a wildcard, we allow any part of speech
                chunk_encodings = list(chunk_encodings)
            else:
                chunk_encodings = [
                    encoding for encoding in chunk_encodings
                    if self.tagger.tag([encoding])[0][1] in pos_tags
                ]
            # select the best encoding from chunk_encodings
            context = tuple(encodings[len(encodings) -
                                      context_length:len(encodings)])
            # note: we could improve the context by adding a post_context (i.e., a period at end)
            chunk_encoding = self._select_encoding(context,
                                                   list(chunk_encodings))

            if chunk_encoding is not None:
                encodings += [chunk_encoding]
                # increment encoded_index based on the chosen chunk_encoding
                encoded_index += len(self.decode_word(chunk_encoding))
            elif pos_tag not in self.optional_tags:
                # if none of the chunk_encodings matches the needed pos_tag, remove all encodings
                # used in the current sentence and select a (hopefully) new sentence template
                sentence_template = None
                if '.' not in encodings:
                    encodings = []
                    encoded_index = 0
                else:
                    last_period_index = (len(encodings) -
                                         1) - encodings[::-1].index('.')
                    partial_sentence = encodings[last_period_index + 1:]
                    encodings = encodings[:last_period_index]
                    partial_sentence_len = len(
                        self.decode_words(partial_sentence))
                    encoded_index -= partial_sentence_len
        encodings += ['.']

        encodings = self._post_process(encodings)

        return encodings

    def _post_process(self, encodings):
        '''
        Takes a set of encodings (series of words). For each encoding, produces all possible
        alternatives that encode the same sequence of digits and replaces the original encoding if
        an alternative encoding has a higher score, which is calculated as the sum of the bigram
        probabilities with the preceding and following encodings (where available). Ties are broken
        in favor of the original encoding.
        '''
        # if there are fewer than two encodings, there is no context available for post processing
        if len(encodings) < 2:
            return encodings

        # otherwise, score all possible alternatives for each encoding using bigram probabilities
        # with the preceding and following encodings (where available) and replace the original
        # encoding if a higher scoring encoding is found
        new_encodings = []
        for index, encoding in enumerate(encodings):
            decoding = self.decode_word(encoding)  # string of digits
            # if the encoding doesn't actually encode any numbers (i.e. we have a punctuation
            # mark), don't attempt to replace
            if len(decoding) == 0:
                new_encodings += [encoding]
                continue
            possible_encodings = self._encode_number_chunk(decoding)
            # default to the original encoding if multiple encodings have this probability
            if index == 0:  # first encoding, no previous context
                max_prob = self.bigram.prob((encoding, ), encodings[index + 1])
            elif index == len(encodings) - 1:  # last encoding, no next context
                max_prob = self.bigram.prob((encodings[index - 1], ), encoding)
            else:  # middle encoding, both previous and next contexts
                prev_prob = self.bigram.prob((encodings[index - 1], ),
                                             encoding)
                next_prob = self.bigram.prob((encoding, ),
                                             encodings[index + 1])
                max_prob = prev_prob + next_prob
            max_prob_encoding = encoding
            # score all possible alternatives
            for pos_encoding in possible_encodings:
                if index == 0:  # first encoding, no previous context
                    prob = self.bigram.prob((pos_encoding, ),
                                            encodings[index + 1])
                elif index == len(
                        encodings) - 1:  # last encoding, no next context
                    prob = self.bigram.prob((encodings[index - 1], ),
                                            pos_encoding)
                else:  # middle encodings, both previous and next contexts
                    prev_prob = self.bigram.prob((encodings[index - 1], ),
                                                 pos_encoding)
                    next_prob = self.bigram.prob((pos_encoding, ),
                                                 encodings[index + 1])
                    prob = prev_prob + next_prob
                if prob > max_prob:
                    max_prob = prob
                    max_prob_encoding = pos_encoding
            new_encodings += [max_prob_encoding]
        return new_encodings
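A rough end-to-end sketch for the class above. The import path is a guess, the digit string is arbitrary, and constructing the encoder is slow because the UnigramTagger is trained on the Brown corpus at init time:

# hypothetical module path
from sentence_tagger_encoder import SentenceTaggerEncoder

encoder = SentenceTaggerEncoder(n=3, alpha=0.1, max_word_length=3, word_length_weight=10)
# encodes the digits as period-separated sentences whose POS tags follow a corpus template;
# num_times > 1 generates several candidates and keeps the lowest-perplexity one
words = encoder.encode_number('31415926', num_times=5)
print(' '.join(words))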
Example #9
class NgramPOSContextEncoder(ContextEncoder):
    '''
    NgramPOSContextEncoder is a ContextEncoder that selects the most common encoding according to 
    an n-gram model for parts-of-speech and words.
    '''
    def __init__(self,
                 pronouncer=Pronouncer(),
                 phoneme_to_digit_dict=None,
                 max_word_length=None,
                 min_sentence_length=5,
                 n=3,
                 alpha=0.1,
                 select_most_likely=True,
                 tagger=UnigramTagger(brown.tagged_sents())):
        '''
        Initializes the NgramPOSContextEncoder. The encoder considers the context of at most
        (n - 1) previous words, picks the most probable part-of-speech for the next word, and then
        chooses the word with that part-of-speech that has the highest n-gram probability if
        select_most_likely is True (otherwise, samples both the tag and the word weighted by
        their probabilities).
        '''
        super(NgramPOSContextEncoder,
              self).__init__(pronouncer=pronouncer,
                             phoneme_to_digit_dict=phoneme_to_digit_dict,
                             max_word_length=max_word_length,
                             context_length=n - 1,
                             min_sentence_length=min_sentence_length)
        self.ngram = NgramModel(n, alpha=alpha)
        self.select_most_likely = select_most_likely

        self.tagger = tagger
        self.pos_ngram = NgramPOSModel(n, alpha=alpha)

    def _select_encoding(self, previous_words, encodings):
        '''
        Finds the most common part-of-speech and selects the most common encoding with that
        part-of-speech according to n-gram probabilities.
        '''
        previous_pos = tuple(
            [tag for word, tag in self.tagger.tag(previous_words)])
        tag_to_words_dict = {}
        for word, tag in self.tagger.tag(encodings):
            if tag is None:
                continue
            if tag in tag_to_words_dict:
                tag_to_words_dict[tag] += [word]
            else:
                tag_to_words_dict[tag] = [word]

        if self.select_most_likely:
            return self._select_most_likely_encoding(previous_words,
                                                     previous_pos,
                                                     tag_to_words_dict,
                                                     encodings)
        else:
            return self._select_weighted_prob_encoding(previous_words,
                                                       previous_pos,
                                                       tag_to_words_dict,
                                                       encodings)

    def _select_most_likely_encoding(self, previous_words, previous_pos,
                                     tag_to_words_dict, encodings):
        # find most likely part-of-speech for the next word
        max_prob = -float('inf')
        max_prob_tag = None
        for tag in tag_to_words_dict.keys():
            prob = self.pos_ngram.prob(previous_pos, tag)
            if prob > max_prob:
                max_prob = prob
                max_prob_tag = tag
        # find most likely possible next word that has most likely POS tag
        if max_prob_tag is not None:
            encodings = tag_to_words_dict[max_prob_tag]
        max_prob = -float('inf')
        max_prob_encoding = None
        for encoding in encodings:
            prob = self.ngram.prob(previous_words, encoding)
            if prob > max_prob:
                max_prob = prob
                max_prob_encoding = encoding
        return max_prob_encoding

    def _select_weighted_prob_encoding(self, previous_words, previous_pos,
                                       tag_to_words_dict, encodings):
        # select POS tag for next word from weighted probabilities
        tags = list(tag_to_words_dict.keys())
        pos_probabilities = [
            exp(self.pos_ngram.prob(previous_pos, tag)) for tag in tags
        ]
        pos_probability_sum = sum(pos_probabilities)
        pos_probabilities_norm = [
            prob / pos_probability_sum for prob in pos_probabilities
        ]
        tag = choice(tags, p=pos_probabilities_norm)
        # select next word with POS chosen above from weighted probabilities
        encodings = tag_to_words_dict[tag]
        probabilities = [
            exp(self.ngram.prob(previous_words, encoding))
            for encoding in encodings
        ]
        probability_sum = sum(probabilities)
        probabilities_norm = [
            probability / probability_sum for probability in probabilities
        ]
        return choice(encodings, p=probabilities_norm)
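Finally, a brief usage sketch for the POS-aware encoder above. The import path, context, and candidates are made up; the tagger first picks the most likely tag for the next slot and the word n-gram model then ranks the candidates carrying that tag:

# hypothetical module path
from ngram_pos_context_encoder import NgramPOSContextEncoder

encoder = NgramPOSContextEncoder(n=3, alpha=0.1, select_most_likely=True)
# candidates whose tag matches the most probable next part-of-speech are ranked
# by the word n-gram model; candidates the tagger cannot tag are dropped
word = encoder._select_encoding(('she', 'reads'), ['news', 'nice', 'now'])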