コード例 #1
0
class BigramInterpolation(LanguageModel):
    """Skeleton bigram/unigram interpolation model: trains both component
    models but still returns placeholder probabilities and words."""

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()

    def train(self, trainingSentences):
        # Fit both component models on the same corpus (unigram first).
        for model in (self.unigram, self.bigram):
            model.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        # Placeholder: interpolation weights are not implemented yet.
        return 0

    def getVocabulary(self, context):
        # Placeholder: no vocabulary exposed yet.
        return []

    def generateWord(self, context):
        # Placeholder word, independent of the context.
        return 'bunny'

    def generateSentence(self):
        sentence = []
        # Cap the sentence at 20 tokens.
        for _ in range(20):
            # Resample until the generator yields something other than UNK.
            word = self.generateWord(sentence)
            while word == LanguageModel.UNK:
                word = self.generateWord(sentence)
            sentence.append(word)
            if word == LanguageModel.STOP:
                break
        return sentence
コード例 #2
0
class BigramInterpolation(LanguageModel):
    """Language model that linearly interpolates bigram and unigram
    probabilities:

        P(w) = lambda_1 * P_bigram(w) + lambda_2 * P_unigram(w)
    """

    def __init__(self, lambda_1=0.5):
        """Create the component models.

        lambda_1 -- weight of the bigram term; the unigram term gets the
        complementary weight so the two always sum to 1.  The default
        (0.5 / 0.5) reproduces the original hard-coded behaviour.
        """
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = lambda_1
        self.lambda_2 = 1 - lambda_1

    def train(self, trainingSentences):
        """Train a bigram-interpolation language model on a training set."""
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        """Return the interpolated probability of the word at *index*
        within *sentence*."""
        return (
            self.lambda_1 * self.bigram.getWordProbability(sentence, index) +
            self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    def generateWord(self, context):
        """Return a word for the given context.

        Placeholder: always returns 'bunny' regardless of the model's
        probabilities.
        """
        return 'bunny'
コード例 #3
0
class BigramInterpolation(LanguageModel):
    """Bigram/unigram interpolated model backed by sparse per-row
    probability matrices (``prob_counter``) of the component models."""

    def __init__(self, lambda_1=0.67):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        # Interpolation weights: lambda_1 for the bigram term, the
        # complement for the unigram term, so they always sum to 1.
        self.lambda_1 = lambda_1
        self.lambda_2 = 1 - lambda_1
    
    '''
    Trains a bigram-interpolation language model on a training set.
    '''
    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)
    
    '''
    Returns the probability of the word at index, according to the model, within
    the specified sentence.
    '''
    def getWordProbability(self, sentence, index):
        return (self.lambda_1*self.bigram.getWordProbability(sentence, index)
                +self.lambda_2*self.unigram.getWordProbability(sentence, index))

    '''
    Returns, for a given context, a random word, according to the probabilities
    in the model.
    '''
    def generateWord(self, context):
        # Condition on the last word of the context; an empty context means
        # we are at the start of a sentence.
        if context:
            previous_word = context[-1]
        else:
            previous_word = LanguageModel.START

        # Out-of-vocabulary history falls back to the UNK row.
        if (previous_word not in self.word_dict) and (previous_word != LanguageModel.START):
            previous_word = LanguageModel.UNK

        # NOTE(review): assumes row 0 of the bigram matrix is the START
        # history — confirm against how Bigram builds word_dict/prob_counter.
        if previous_word == LanguageModel.START:
            previous_word_index = 0
        else:
            previous_word_index = self.word_dict[previous_word]

        # Densify the relevant next-word distributions (sparse row -> 1-D).
        probs_bigram = self.bigram.prob_counter[previous_word_index].toarray().ravel()
        probs_unigram = self.unigram.prob_counter[0].toarray().ravel()

        # Because the unigram model and bigram model have different word index for STOP, I need to make some adjustment
        stop_index = self.unigram.word_dict[LanguageModel.STOP]
        # move STOP probability to the first element of probs_unigram and leave the others unchanged
        stop_prob = probs_unigram[stop_index]
        probs_unigram = np.append(stop_prob, np.delete(probs_unigram, stop_index))
        probs = self.lambda_1*probs_bigram + self.lambda_2*probs_unigram  # Get the interpolation probability

        # Words ordered by their index so positions line up with `probs`.
        word_list = sorted(self.word_dict.items(), key=lambda item: item[1])
        word_list = [k[0] for k in word_list]

        # np.random.choice requires probs to sum to 1; presumably both
        # component distributions are already normalized — verify upstream.
        return np.random.choice(word_list, p=probs)
コード例 #4
0
def main():
    """Train a bigram model and report three probability estimates for the
    test file named on the command line."""
    model = Bigram()
    model.train()
    print(sys.argv[1])
    unsmoothed, laplace, good_turing = model.test(sys.argv[1])
    for label, prob in (
            ("------Unsmooth Probability---------", unsmoothed),
            ("------Laplace Smooth Prob---------", laplace),
            ("------Good Turing Prob---------", good_turing)):
        print(label)
        print('{:.60f}'.format(prob))
コード例 #5
0
class Interpolation(LanguageModel):
    """Linear interpolation of unigram, bigram and trigram models:

        P(w) = 0.5 * P_tri(w) + 0.25 * P_bi(w) + 0.25 * P_uni(w)
    """

    def __init__(self):
        self.unigram_model = Unigram()
        self.bigram_model = Bigram()
        self.trigram_model = Trigram()
        # Arbitrary interpolation weights; they must sum to 1.
        self.unigram_lambda = .25
        self.bigram_lambda = .25
        self.trigram_lambda = .5

    def train(self, trainingSentences):
        """Train all three component models on the same corpus."""
        self.unigram_model.train(trainingSentences)
        self.bigram_model.train(trainingSentences)
        self.trigram_model.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        """Return the weighted sum of the three models' probabilities for
        the word at *index* in *sentence* (arbitrary lambdas)."""
        return (self.trigram_lambda * self.trigram_model.getWordProbability(sentence, index)) \
               + (self.bigram_lambda * self.bigram_model.getWordProbability(sentence, index)) \
               + (self.unigram_lambda * self.unigram_model.getWordProbability(sentence, index))

    def getVocabulary(self, context):
        """All three models share one vocabulary, so any of them can answer."""
        return self.trigram_model.getVocabulary(context)

    def generateSentence(self):
        """Generate a sentence of at most 20 words by picking, for each
        word, one of the three models at random with probability equal to
        its interpolation weight."""
        sentence = []
        prev_previous = LanguageModel.START
        previous = random.choice(list(self.trigram_model.word_count.keys()))
        for i in range(20):
            model_choice = random.random()
            if model_choice <= self.trigram_lambda:
                word = self.trigram_model.generateWord(prev_previous, previous)
            elif model_choice <= self.trigram_lambda + self.bigram_lambda:
                # BUG FIX: was `generate_word`, which does not match the
                # camelCase generateWord API used by every other model call
                # in this file and would raise AttributeError at runtime.
                word = self.bigram_model.generateWord(previous)
            else:
                word = self.unigram_model.generateWord()
            sentence.append(word)
            prev_previous = previous
            previous = word
            if word == LanguageModel.STOP:
                break
        return sentence
コード例 #6
0
ファイル: test.py プロジェクト: Elixeus/NLP
from bigram import Bigram
import os
import re

if __name__ == '__main__':
    # Train on the Dark Souls corpus, then evaluate on the held-out file.
    bg = Bigram()
    bg.train(os.path.abspath('../darksouls_training.txt'))
    # Parenthesized single-argument prints are valid in both Python 2 and
    # Python 3, and match the print() style used elsewhere in this file.
    print('model trained')
    bg.test('../darksouls_test.txt')
    print('The entropy for the test set is: {:.2f}.'.format(bg.entropy))
    print('The perplexity for the test set is: {:.2f}.'.format(bg.perplexity))
class BigramInterpolation(LanguageModel):
    """Bigram model interpolated with a unigram model using a single
    bigram:unigram weight ratio of 1 : coef."""

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # Relative weight of the unigram term; the bigram term has weight 1.
        self.coef = 0.5
        print("W(bigram):W(unigram) coefficient is 1 :", self.coef)

    def train(self, trainingSentences):
        """Train both component models on the same corpus."""
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        """Return the interpolated probability of the word at *index*.

        Weights are normalized as 1/(1+coef) for the bigram term and
        coef/(1+coef) for the unigram term.  When the bigram count for
        (prev_word, word) is zero, only the unigram term is returned
        (the bigram term would contribute 0 anyway).
        """
        coef = self.coef
        x = 1 / (1 + coef)

        # Resolve (word, prev_word) for the position; index == len(sentence)
        # denotes the implicit STOP token after the last word.
        if index == len(sentence):
            word = LanguageModel.STOP
            prev_word = sentence[-1]
        elif index == 0:
            word = sentence[0]
            prev_word = LanguageModel.START
        else:
            word = sentence[index]
            prev_word = sentence[index - 1]

        # Unknown history backs off to the UNK row.
        if prev_word not in self.bigram.probCounter:
            prev_word = LanguageModel.UNK

        # NOTE(review): if probCounter is a defaultdict, this lookup may
        # insert empty entries as a side effect — confirm in Bigram.
        if self.bigram.probCounter[prev_word][word] == 0:
            return x * coef * self.unigram.getWordProbability(sentence, index)
        else:
            return x * self.bigram.getWordProbability(
                sentence, index) + x * coef * self.unigram.getWordProbability(
                    sentence, index)

    def getVocabulary(self, context):
        """Return every word that may follow *context*: everything in the
        bigram's word set except START, plus STOP."""
        next_posb_word = []
        # append all possible word except START in self.total
        for next_word in self.bigram.total:
            if next_word != LanguageModel.START:
                next_posb_word.append(next_word)
        # append STOP manually since there is no STOP in self.total
        next_posb_word.append(LanguageModel.STOP)

        return next_posb_word

    def generateWord(self, context):
        """Delegate word generation to the bigram model."""
        return self.bigram.generateWord(context)

    def generateSentence(self):
        """Generate a sentence of at most 20 words, resampling any UNK and
        stopping once STOP is produced."""
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:
                # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result