class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()

    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        return 0

    def getVocabulary(self, context):
        return []

    def generateWord(self, context):
        return 'bunny'

    def generateSentence(self):
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:  # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result
class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = 0.5
        self.lambda_2 = 0.5

    '''
    Trains a bigram-interpolation language model on a training set.
    '''
    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    '''
    Returns the probability of the word at index, according to the model,
    within the specified sentence.
    '''
    def getWordProbability(self, sentence, index):
        return (self.lambda_1 * self.bigram.getWordProbability(sentence, index)
                + self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    '''
    Returns, for a given context, a random word, according to the
    probabilities in the model.
    '''
    def generateWord(self, context):
        return 'bunny'
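A quick worked check of the linear interpolation in getWordProbability above (a minimal sketch; the 0.4 and 0.1 component probabilities are made up for illustration, not taken from any trained model). As long as lambda_1 + lambda_2 = 1 and both component models return proper distributions, the interpolated scores also form a valid probability distribution.

lambda_1, lambda_2 = 0.5, 0.5
p_bigram, p_unigram = 0.4, 0.1                      # illustrative component probabilities
p_interp = lambda_1 * p_bigram + lambda_2 * p_unigram
assert abs(p_interp - 0.25) < 1e-12                 # 0.5 * 0.4 + 0.5 * 0.1 = 0.25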
def __init__(self):
    self.docs = dict()
    self.index = dict()
    self.vecs = None
    self.consts = None
    self.modified = False
    self.bigram_index = Bigram()
def __init__(self):
    self.unigram_model = Unigram()
    self.bigram_model = Bigram()
    self.trigram_model = Trigram()
    self.unigram_lambda = .25
    self.bigram_lambda = .25
    self.trigram_lambda = .5
def __init__(self, lambda_1=0.67):
    self.unigram = Unigram()
    self.bigram = Bigram()
    # just needed for languageModel.py to work
    self.word_dict = self.bigram.word_dict
    self.lambda_1 = lambda_1
    self.lambda_2 = 1 - lambda_1
def main():
    raw_data = get_data()

    # Unigram
    uni = Unigram(raw_data)
    uni.main()

    # Bigram
    bi = Bigram(raw_data)
    bi.main()
import numpy as np


class BigramInterpolation(LanguageModel):

    def __init__(self, lambda_1=0.67):
        self.unigram = Unigram()
        self.bigram = Bigram()
        # just needed for languageModel.py to work
        self.word_dict = self.bigram.word_dict
        self.lambda_1 = lambda_1
        self.lambda_2 = 1 - lambda_1

    '''
    Trains a bigram-interpolation language model on a training set.
    '''
    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    '''
    Returns the probability of the word at index, according to the model,
    within the specified sentence.
    '''
    def getWordProbability(self, sentence, index):
        return (self.lambda_1 * self.bigram.getWordProbability(sentence, index)
                + self.lambda_2 * self.unigram.getWordProbability(sentence, index))

    '''
    Returns, for a given context, a random word, according to the
    probabilities in the model.
    '''
    def generateWord(self, context):
        if context:
            previous_word = context[-1]
        else:
            previous_word = LanguageModel.START

        if (previous_word not in self.word_dict) and (previous_word != LanguageModel.START):
            previous_word = LanguageModel.UNK

        if previous_word == LanguageModel.START:
            previous_word_index = 0
        else:
            previous_word_index = self.word_dict[previous_word]

        probs_bigram = self.bigram.prob_counter[previous_word_index].toarray().ravel()
        probs_unigram = self.unigram.prob_counter[0].toarray().ravel()

        # The unigram model and the bigram model use different word indices for STOP,
        # so move the STOP probability to the first element of probs_unigram and
        # leave the others unchanged.
        stop_index = self.unigram.word_dict[LanguageModel.STOP]
        stop_prob = probs_unigram[stop_index]
        probs_unigram = np.append(stop_prob, np.delete(probs_unigram, stop_index))

        # Interpolated distribution over the vocabulary
        probs = self.lambda_1 * probs_bigram + self.lambda_2 * probs_unigram

        word_list = sorted(self.word_dict.items(), key=lambda item: item[1])
        word_list = [k[0] for k in word_list]
        return np.random.choice(word_list, p=probs)
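A minimal usage sketch of the class above, assuming the surrounding project's LanguageModel, Unigram, and Bigram classes are importable and that trainingSentences is a list of tokenized sentences; the toy corpus here is purely illustrative.

model = BigramInterpolation(lambda_1=0.67)
model.train([['the', 'cat', 'sat'], ['the', 'dog', 'ran']])   # toy corpus, illustrative only
print(model.getWordProbability(['the', 'cat', 'sat'], 1))     # interpolated P('cat' | 'the')
print(model.generateWord(['the']))                            # sample a next word after 'the'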
import sys


def main():
    bg = Bigram()
    bg.train()
    print(sys.argv[1])

    p, q, r = bg.test(sys.argv[1])

    print("------Unsmooth Probability---------")
    print('{:.60f}'.format(p))
    print("------Laplace Smooth Prob---------")
    print('{:.60f}'.format(q))
    print("------Good Turing Prob---------")
    print('{:.60f}'.format(r))
from collections import defaultdict


def make_chains(in_string):
    """Make markov chains from text in in_string."""
    chains = defaultdict(list)
    words = in_string.strip().split()

    for i in range(len(words) - 2):
        chains[Bigram(words[i], words[i + 1])].append(words[i + 2])

    # Add marker for end-of-text
    chains[Bigram(words[i + 1], words[i + 2])].append(None)

    return chains
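The markov-chain snippets here (make_chains above and make_text further down) assume a small Bigram record type with word1/word2 fields, since make_text reads bigram.word2 and uses bigrams as dict keys. A minimal sketch of that assumption as a namedtuple, which is hashable and therefore usable as a key in chains; the original project may define Bigram differently.

from collections import namedtuple

# Hypothetical definition for illustration only.
Bigram = namedtuple('Bigram', ['word1', 'word2'])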
def main_bigramTrain(options, input):
    bigramModel = Bigram(0.000000000000001)

    for sen, _ in sentenceIterator(input):
        tags = [tok[options.tagField] for tok in sen]
        bigramModel.obsSequence(tags)

    bigramModel.count()
    bigramModel.writeToFile(options.bigramModelFile)
import random


class Interpolation(LanguageModel):

    def __init__(self):
        self.unigram_model = Unigram()
        self.bigram_model = Bigram()
        self.trigram_model = Trigram()
        self.unigram_lambda = .25
        self.bigram_lambda = .25
        self.trigram_lambda = .5

    def train(self, trainingSentences):
        self.unigram_model.train(trainingSentences)
        self.bigram_model.train(trainingSentences)
        self.trigram_model.train(trainingSentences)

    # Arbitrary lambdas.
    def getWordProbability(self, sentence, index):
        return (self.trigram_lambda * self.trigram_model.getWordProbability(sentence, index)) \
            + (self.bigram_lambda * self.bigram_model.getWordProbability(sentence, index)) \
            + (self.unigram_lambda * self.unigram_model.getWordProbability(sentence, index))

    # Doesn't matter which model we use here - the vocabulary is the same
    def getVocabulary(self, context):
        return self.trigram_model.getVocabulary(context)

    # What does generating a sentence in an interpolation model look like?
    # I don't know, so what I've done is generate a word using the trigram, bigram,
    # or unigram model some of the time, using the same values as in getWordProbability.
    def generateSentence(self):
        sentence = []
        prev_previous = LanguageModel.START
        previous = random.choice(list(self.trigram_model.word_count.keys()))

        for i in range(20):
            model_choice = random.random()
            if model_choice <= self.trigram_lambda:
                word = self.trigram_model.generateWord(prev_previous, previous)
            elif model_choice <= self.trigram_lambda + self.bigram_lambda:
                word = self.bigram_model.generate_word(previous)
            else:
                word = self.unigram_model.generateWord()
            sentence.append(word)
            prev_previous = previous
            previous = word
            if word == LanguageModel.STOP:
                break

        return sentence
def __init__(self, featureSet, options):
    self.featureSet = featureSet
    self.params = '-b 1'
    self.lmw = options['lmw']
    modelName = options['modelName']

    sys.stderr.write('loading transition model...')
    self.transProbs = Bigram.getModelFromFile(options['bigramModelFile'])
    sys.stderr.write('done\nloading observation model...')
    self.model = load_model('{0}.model'.format(modelName))
    self.labelCounter = options['labelCounter']
    self.featCounter = options['featCounter']
    sys.stderr.write('done\n')
from random import choice


def make_text(chains):
    """Generate markov-chain-generated text from chains."""
    bigram = choice(list(chains))
    print(bigram, end=' ')

    while True:
        follows = choice(chains[bigram])
        if follows is None:
            break
        print(follows, end=' ')
        bigram = Bigram(bigram.word2, follows)

    print()
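A minimal end-to-end sketch combining make_chains and make_text, reusing the hypothetical Bigram namedtuple sketched after make_chains above; the input string is throwaway illustration, and any whitespace-separated text of at least three words works.

text = "the quick brown fox jumps over the lazy dog"
chains = make_chains(text)
make_text(chains)   # prints a random walk through the chains, stopping at the end-of-text marker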
def get_bigrams(self):
    bigram_list = []
    for bigram in nltk.bigrams(self.low_case_words_list):
        bigram_list.append(Bigram(self.doc_id, bigram))
    return bigram_list
from hmm import HMM
from bigram import Bigram
from config import TESTFILE  # TESTFILE is defined in config.py
import time
import re
import codecs

if __name__ == "__main__":
    print('Segmenting TESTFILE=' + TESTFILE + ' specified in config.py ...')
    start = time.time()
    with codecs.open(TESTFILE, 'r', 'gbk') as f:
        text = f.read()
    lines = text.split('\r\n')

    bigram = Bigram()
    lines_segs = []
    # Segment line by line
    for i, line in enumerate(lines):
        line_segs = []
        if line != '':
            ###### pass each line to the segmentation model #####
            line_segs = bigram.cut(line)
            ######################################################
        else:
            line_segs = line
        lines_segs.append(line_segs)
        # Print elapsed time every thousand lines
        if i % 1000 == 0:
            print(str(i) + '/' + str(len(lines)), time.time() - start)
ratios = np.arange(0.05, 1.05, 0.05)
unigram_accuracies = []
tfidf_accuracies = []
bigram_accuracies = []

for r in ratios:
    unigram_perceptron = Unigram(train_ratio=r)
    unigram_accuracy = unigram_perceptron.accuracy
    unigram_accuracies.append(unigram_accuracy)
    print(r, "unigram_perceptron", unigram_accuracy)

    tfidf_perceptron = Tfidf(train_ratio=r)
    tfidf_accuracy = tfidf_perceptron.accuracy
    tfidf_accuracies.append(tfidf_accuracy)
    print(r, "tfidf_perceptron", tfidf_accuracy)

    bigram_perceptron = Bigram(train_ratio=r)
    bigram_accuracy = bigram_perceptron.accuracy
    bigram_accuracies.append(bigram_accuracy)
    print(r, "bigram_perceptron", bigram_accuracy)

pickle.dump(unigram_accuracies, open("unigram_accuracies.pkl", "wb"))
pickle.dump(tfidf_accuracies, open("tfidf_accuracies.pkl", "wb"))
pickle.dump(bigram_accuracies, open("bigram_accuracies.pkl", "wb"))

# unigram_accuracies = pickle.load(open("unigram_accuracies.pkl", "rb"))
# tfidf_accuracies = pickle.load(open("tfidf_accuracies.pkl", "rb"))
# bigram_accuracies = pickle.load(open("bigram_accuracies.pkl", "rb"))

fig = plt.figure()
ax1 = fig.add_subplot(111)
num_samples = ratios * 1000000
ax1.scatter(num_samples, unigram_accuracies, c='b', label='Unigrams')
from unigram import Unigram
from bigram import Bigram
from trigram import Trigram

inputs = read('input.txt')[0].strip().split(" ")
V, N, S_FACTOR, TRAINING_FILE, TEST_FILE = (int(inputs[0]), int(inputs[1]),
                                            float(inputs[2]), inputs[3], inputs[4])
OUTPUT_FILE_NAME = f"./results/trace_{V}_{N}_{S_FACTOR}.txt"

t1 = time()
if V == 3:
    print(f"BYOM: V = {V} n = 3 d = {S_FACTOR}")
    BYOM = BYOM(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    BYOM.execute()
elif N == 1:
    print(f"unigram: V = {V} d = {S_FACTOR}")
    UNIGRAM = Unigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    UNIGRAM.execute()
elif N == 2:
    print(f"bigram: V = {V} d = {S_FACTOR}")
    BIGRAM = Bigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    BIGRAM.execute()
elif N == 3:
    print(f"trigram: V = {V} d = {S_FACTOR}")
    TRIGRAM = Trigram(V, S_FACTOR, TRAINING_FILE, TEST_FILE, OUTPUT_FILE_NAME)
    TRIGRAM.execute()
t2 = time()
print(f"execution time: {t2 - t1}s")
from bigram import Bigram
import os
import re

if __name__ == '__main__':
    bg = Bigram()
    bg.train(os.path.abspath('../darksouls_training.txt'))
    print 'model trained'
    # for key, item in bg.get_model().iteritems():
    #     print key, item
    bg.test('../darksouls_test.txt')
    print 'The entropy for the test set is: {:.2f}.'.format(bg.entropy)
    print 'The perplexity for the test set is: {:.2f}.'.format(bg.perplexity)
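For the entropy and perplexity reported above, the usual relationship (assuming entropy is the average negative log2-probability per token, as most course-style bigram models define it) is perplexity = 2 ** entropy. A minimal check with an illustrative value, not taken from the model above:

import math

entropy = 7.5                      # illustrative value only
perplexity = 2 ** entropy          # PP = 2^H when H is measured in bits per token
assert math.isclose(math.log2(perplexity), entropy)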
def __init__(self):
    self.unigram = Unigram()
    self.bigram = Bigram()
    self.coef = 0.5
    print("W(bigram):W(unigram) coefficient is 1 :", self.coef)
def __init__(self):
    self.unigram = Unigram()
    self.bigram = Bigram()
class RetrievalIndex:

    def __init__(self):
        self.docs = dict()
        self.index = dict()
        self.vecs = None
        self.consts = None
        self.modified = False
        self.bigram_index = Bigram()

    def save(self, file_path):
        with open(file_path, 'wb') as f:
            pickle.dump(self, f)

    @classmethod
    def load(cls, file_path):
        with open(file_path, 'rb') as f:
            index = pickle.load(f)
        return index

    def __getstate__(self):
        return self.__dict__

    def __setstate__(self, d):
        self.__dict__ = d

    @classmethod
    def from_xml(cls, xml, max_num=None, method='file'):
        index = cls()
        for doc in Doc.create_list_from_xml(xml, max_num=max_num, method=method):
            index.add_doc(doc)
        return index

    def add_doc(self, doc, raise_on_exists=True):
        self.set_modified()
        doc_id = doc.doc_id
        if doc_id in self.docs:
            if raise_on_exists:
                raise ValueError("Doc already in list, change id")
            else:
                return
        self.docs[doc_id] = doc
        for word, position, doc_part in doc.info_iterator:
            self.word_index_add_doc(word, position, doc_id, doc_part)
        # bigram
        for word in doc.bigram_words:
            self.bigram_index.add_word(word)

    def remove_doc(self, doc_id, raise_on_not_exists=True):
        self.set_modified()
        self.modified = True
        if doc_id not in self.docs:
            if raise_on_not_exists:
                raise ValueError("doc_id not found")
            else:
                return
        doc = self.docs[doc_id]
        for word, position, doc_part in doc.info_iterator:
            self.word_index_remove_doc(word, doc_id)
        del self.docs[doc_id]
        # bigram
        for word in doc.bigram_words:
            self.bigram_index.remove_word(word)

    def word_index_add_doc(self, word, position, doc_id, doc_part):
        self.index.setdefault(word, {}).setdefault(doc_id, {}).setdefault(
            doc_part, []).append(position)

    # assumes no Attack
    def word_index_remove_doc(self, word, doc_id, raise_on_not_exists=False):
        posting_list = self.get_posting_list(
            word, raise_on_not_exists=raise_on_not_exists)
        if doc_id not in posting_list:
            if raise_on_not_exists:
                raise ValueError("Doc %s not in posting list for word %s" % (doc_id, word))
            return
        del posting_list[doc_id]
        if not posting_list:
            del self.index[word]

    def get_posting_list(self, word, raise_on_not_exists=True):
        if raise_on_not_exists and word not in self.index:
            raise ValueError('term not in index')
        return self.index.get(word, {})

    def tf(self, term, doc_id, part):
        posting_list = self.get_posting_list(term)
        tf = len(posting_list.get(doc_id, {}).get(part, {}))
        return Tf_calc.transform_tf(tf)

    def idf(self, term, part):
        df = len(self.get_posting_list(term))
        return Tf_calc.idf_transform(df, self.N)

    def tf_idf(self, term, doc_id, part):
        return self.tf(term, doc_id, part) * self.idf(term, part)

    def get_exact_docs(self, li_title, li_text, method="standard"):
        if method == "standard":
            li = li_text

            def is_fine(doc_id):
                return all(self.docs[doc_id].has_exact(phrase) for phrase in li)
        else:
            raise ValueError("method must be standard")
        ans = list(filter(is_fine, self.docs.keys()))
        return ans

    def query(self, query_title, query_text, should_divide=False, k=15,
              title_ratio=2, flatten=True, exact_method="standard"):
        query_title, li_title = Text_cleaner.query_cleaner(query_title)
        query_text, li_text = Text_cleaner.query_cleaner(query_text)
        query = Doc.from_query(query_title, query_text)
        good_doc_ids = self.get_exact_docs(li_title, li_text, exact_method)
        self.make_vectors()
        v, const = query.tf_idf()
        scores = []
        for doc_id, doc_v in self.vecs.items():
            if doc_id not in good_doc_ids:
                continue
            part_score = dict()
            for part in doc_v:
                part_score[part] = 0
                for term, w_q in v[part].items():
                    part_score[part] += doc_v[part].get(term, 0) * w_q
                if should_divide:
                    modified_vector = {
                        term: doc_v['text'].get(term, 0)
                        for term in v[part].keys()
                    }
                    new_constant = Tf_calc.const(modified_vector)
                    normalization_factor = new_constant * const[part] + EPSILON
                else:
                    normalization_factor = 1
                part_score[part] /= normalization_factor
            final_score = part_score['title'] * title_ratio + part_score['text']
            scores.append((doc_id, final_score))
        scores.sort(key=lambda x: x[1], reverse=True)
        top_k = [scores[i][0] for i in range(min(k, len(scores)))]
        if k == 1 and flatten:
            return top_k[0]
        else:
            return top_k

    def make_vectors(self):
        if not self.modified:
            return
        self.vecs = {}
        self.consts = {}
        for doc_id, doc in self.docs.items():
            self.vecs[doc_id] = {}
            self.consts[doc_id] = {}
            for part in Doc.PARTS:
                v = dict()
                for term in doc.distinct_terms(part):
                    v[term] = self.tf_idf(term, doc_id, part)
                self.vecs[doc_id][part] = v
                self.consts[doc_id][part] = Tf_calc.const(v)
        self.modified = False

    def set_modified(self):
        self.modified = True

    @property
    def N(self):
        return len(self.docs)

    def __str__(self):
        ans = ""
        ans += "Doc_ids: %s\n" % str(list(self.docs))
        ans += '+++++++++++++++++++++\n'
        ans += "Index: %s\n" % '\n------------\n'.join(
            "%s: %s" % (word, self.index[word]) for word in self.index)
        return ans
def __init__(self):
    self.load_param()
    self.bigram = Bigram()
def train(self):
    bi_diff_word_dict = {}
    u_count_dict = {}
    count_word_dict = {}

    for word_line in self.word:
        count = 1
        while len(word_line) > count:
            word_dict = Bigram(self.word_dict, word_line[count])
            self.word_dict = word_dict.dict_bigram()

            count_word_dict = Bigram(count_word_dict, word_line[count - 1])
            count_word_dict = count_word_dict.dict_bigram()

            context_count_dict = Bigram(self.context_count_dict, '')
            self.context_count_dict = context_count_dict.dict_bigram()

            bi_word = word_line[count - 1] + ' ' + word_line[count]
            word_dict = Bigram(self.word_dict, bi_word)
            self.word_dict = word_dict.dict_bigram()
            bi_diff_word_dict.update({bi_word: word_line[count - 1]})

            context_count_dict = Bigram(self.context_count_dict, word_line[count - 1])
            self.context_count_dict = context_count_dict.dict_bigram()

            count = count + 1

    # Witten-Bell smoothing
    for k, v in bi_diff_word_dict.items():
        if v in u_count_dict:
            bi_value = u_count_dict[v]
            u_count_dict.update({v: bi_value + 1})
        else:
            u_count_dict[v] = 1

    for k in count_word_dict.keys():
        lambda_w = 1 - (1.0 * u_count_dict[k] / (u_count_dict[k] + count_word_dict[k]))
        self.lambda_word_dict.update({k: lambda_w})

    for ngram, count in self.word_dict.items():
        n_split_word = ngram.split(' ')
        n_split_word.pop()
        context = ''.join(n_split_word)
        prob = 1.0 * self.word_dict[ngram] / self.context_count_dict[context]
        self.word_dict.update({ngram: prob})
import numpy as np
import csv
from bigram import Bigram

ham_validation_list = Bigram.processed_file("../ham/validation.txt")
spam_validation_list = Bigram.processed_file("../spam/validation.txt")

with open('eval_k_0.1_1.0.csv', mode='w') as report_file:
    report_writer = csv.writer(report_file, delimiter=',', quotechar='"',
                               quoting=csv.QUOTE_MINIMAL)
    report_writer.writerow([
        'k', 'ham accuracy', 'spam accuracy', 'precision', 'recall', 'F1_score'
    ])

    for k in np.linspace(0.1, 1.0, 10):
        ham_pro_dict = Bigram.bigram_probability_dict("../ham/train.txt", k)
        spam_pro_dict = Bigram.bigram_probability_dict("../spam/train.txt", k)

        ham_count_ham = ham_count_spam = 0
        for message in ham_validation_list:
            spam_pp = Bigram.perplexity_single_message(spam_pro_dict, message)
            ham_pp = Bigram.perplexity_single_message(ham_pro_dict, message)
            result = None
            if spam_pp > ham_pp:
                ham_count_ham += 1
            else:
                ham_count_spam += 1

        spam_count_ham = spam_count_spam = 0
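The fragment above stops before the metrics are computed. A minimal sketch of how precision, recall, and F1 are typically derived from the four counters it maintains, treating spam as the positive class; the variable names continue the snippet's convention and the formulas are the standard definitions, not code from the original project.

# Assumed completion of the loops above: counts of how validation messages were classified.
tp = spam_count_spam              # spam messages classified as spam
fp = ham_count_spam               # ham messages classified as spam
fn = spam_count_ham               # spam messages classified as ham

precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
f1_score = (2 * precision * recall / (precision + recall)
            if (precision + recall) else 0.0)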
import json


class HMM:

    def __init__(self):
        self.load_param()
        self.bigram = Bigram()

    def load_param(self):
        self.init_prob = self.read('init_prob')
        self.emiss_prob = self.read('emiss_prob')
        self.trans_prob = self.read('trans_prob')
        self.pinyin_states = self.read('pinyin_states')

    def read(self, filename):
        with open('model_params/' + filename + '.json', 'r') as f:
            return json.load(f)

    # Viterbi process
    def trans(self, strs):
        # split the pinyin string into syllables
        seq = self.bigram.dp_search(strs)

        # smoothing floor for missing probabilities
        self.min_f = -3.14e+100

        length = len(seq)
        viterbi = {}
        for i in range(length):
            viterbi[i] = {}

        # initialize
        for s in self.pinyin_states.get(seq[0]):
            viterbi[0][s] = (
                self.init_prob.get(s, self.min_f)
                + self.emiss_prob.get(s, {}).get(seq[0], self.min_f)
                + self.trans_prob.get(s, {}).get('BOS', self.min_f), -1)

        # DP
        # note: trans_prob = {post1: {pre1: p1, pre2: p2}, post2: {pre1: p1, pre2: p2}}
        for i in range(length - 1):
            for s in self.pinyin_states.get(seq[i + 1]):
                viterbi[i + 1][s] = max([
                    (viterbi[i][pre][0]
                     + self.emiss_prob.get(s, {}).get(seq[i + 1], self.min_f)
                     + self.trans_prob.get(s, {}).get(pre, self.min_f), pre)
                    for pre in self.pinyin_states.get(seq[i])
                ])

        for s in self.pinyin_states.get(seq[-1]):
            viterbi[length - 1][s] = (
                viterbi[length - 1][s][0]
                + self.trans_prob.get('EOS', {}).get(s, self.min_f),
                viterbi[length - 1][s][1])

        words = [None] * length
        words[-1] = max(viterbi[length - 1], key=viterbi[length - 1].get)
        for n in range(length - 2, -1, -1):
            words[n] = viterbi[n + 1][words[n + 1]][1]

        return ''.join(w for w in words)
class BigramInterpolation(LanguageModel):

    def __init__(self):
        self.unigram = Unigram()
        self.bigram = Bigram()
        self.coef = 0.5
        print("W(bigram):W(unigram) coefficient is 1 :", self.coef)

    def train(self, trainingSentences):
        self.unigram.train(trainingSentences)
        self.bigram.train(trainingSentences)

    def getWordProbability(self, sentence, index):
        coef = self.coef
        x = 1 / (1 + coef)
        if index == len(sentence):
            word = LanguageModel.STOP
            prev_word = sentence[-1]
        elif index == 0:
            word = sentence[0]
            prev_word = LanguageModel.START
        else:
            word = sentence[index]
            prev_word = sentence[index - 1]

        if prev_word not in self.bigram.probCounter:
            prev_word = LanguageModel.UNK

        if self.bigram.probCounter[prev_word][word] == 0:
            return x * coef * self.unigram.getWordProbability(sentence, index)
        else:
            return (x * self.bigram.getWordProbability(sentence, index)
                    + x * coef * self.unigram.getWordProbability(sentence, index))

    def getVocabulary(self, context):
        next_posb_word = []
        # append all possible words except START in self.total
        for next_word in self.bigram.total:
            if next_word != LanguageModel.START:
                next_posb_word.append(next_word)
        # append STOP manually since there is no STOP in self.total
        next_posb_word.append(LanguageModel.STOP)
        return next_posb_word

    def generateWord(self, context):
        return self.bigram.generateWord(context)

    def generateSentence(self):
        result = []
        # limit sentence length to 20
        for i in range(20):
            word = LanguageModel.UNK
            while word == LanguageModel.UNK:  # make sure word != UNK
                word = self.generateWord(result)
            result.append(word)
            if word == LanguageModel.STOP:
                break
        return result