Example #1
0
    def good_turing_smoothing(self, emission, transition):
        """Fit Simple Good-Turing distributions to raw HMM counts.

        Args:
            emission: mapping of state -> iterable of observed symbols.
            transition: mapping of state -> iterable of successor states.

        Returns:
            A pair ``(emission_probability, transition_probability)`` where
            each value maps the original keys to a
            ``SimpleGoodTuringProbDist`` built from the frequency counts.
        """
        emission_probability = {
            state: SimpleGoodTuringProbDist(FreqDist(observed))
            for state, observed in emission.items()
        }
        transition_probability = {
            state: SimpleGoodTuringProbDist(FreqDist(successors))
            for state, successors in transition.items()
        }
        return emission_probability, transition_probability
Example #2
0
def create_entropy():
    """Compute a Simple Good-Turing probability for every word in the
    ``corpus`` table of chappies_brain.db and store it in ``entropy``.

    Each corpus row is stripped of trailing whitespace and split on single
    spaces; the resulting word frequencies feed a SimpleGoodTuringProbDist,
    and ``(word, probability)`` pairs are inserted best-effort (rows that
    violate a constraint, e.g. duplicates, are skipped).
    """
    from nltk import SimpleGoodTuringProbDist, FreqDist

    conn = sqlite3.connect("chappies_brain.db")
    c = conn.cursor()

    words = []
    for row in c.execute('SELECT corpus FROM corpus'):
        # Bug fix: the original called rstrip() but immediately discarded the
        # result by re-splitting the raw row value.  Strip, then split.
        # TODO: remove punctuation so each single word is recognized and
        # scored on its own.
        words.extend(row[0].rstrip().split(" "))

    countingWords = Counter(words)
    fd = FreqDist(countingWords)
    p = SimpleGoodTuringProbDist(fd)

    for word in fd:
        entropy = round(p.prob(word), 7)
        try:
            c.execute("INSERT INTO entropy VALUES (?,?)", [word, entropy])
        except sqlite3.Error:
            # Deliberate best effort: skip rows the database rejects
            # (narrowed from a bare except that hid all errors).
            pass

    conn.commit()
    conn.close()
Example #3
0
def getTransitionProb(sm, sents, tagset):
    """Estimate tag-transition distributions from tagged sentences.

    Usage: ``P(nextTag | prevTag) = transitionProb[prevTag].prob(nextTag)``.

    Args:
        sm: smoothing name — "no", "laplace", "goodturing", or anything
            else for Witten-Bell.
        sents: iterable of sentences, each a list of (word, tag) pairs.
        tagset: tags to build a distribution for.

    Returns:
        dict mapping each tag in *tagset* to a smoothed probability
        distribution over its successor tags.
    """
    bigrams = []
    for sent in sents:
        sentence_tags = [tag for (_, tag) in sent]
        bigrams += ngrams(sentence_tags, 2)

    transitionProb = {}
    for tag in tagset:
        followers = FreqDist(nxt for (prev, nxt) in bigrams if prev == tag)

        if sm == "no":
            dist = LidstoneProbDist(followers, 0, bins=1e5)
        elif sm == "laplace":
            dist = LidstoneProbDist(followers, 1, bins=1e5)
        elif sm == "goodturing":
            dist = SimpleGoodTuringProbDist(followers, bins=1e5)
        else:
            dist = WittenBellProbDist(followers, bins=1e5)
        transitionProb[tag] = dist

    return transitionProb
def calculate_probability():
    """Recompute Simple Good-Turing probabilities for every chain in the
    ``corpus`` table and rewrite them into the ``kette`` table.

    Reads per-chain occurrence counts from chappies_brain.db, fits a
    SimpleGoodTuringProbDist, drops the old probability table via
    ``drop_probability()``, and bulk-inserts the fresh values.
    """
    conn = sqlite3.connect("chappies_brain.db")
    c = conn.cursor()

    # Count the occurrences of each chain.
    # (The original also ran SELECT count(*) into an unused local; removed.)
    kette_dict = {}
    for row in c.execute('SELECT  corpus, count(*) FROM corpus group by corpus'):
        kette_dict[row[0]] = row[1]
    conn.close()

    fd = FreqDist(kette_dict)
    print(fd)
    p = SimpleGoodTuringProbDist(fd)
    print("Run Simple Good Turing", fd)

    values = [(chain, p.prob(chain)) for chain in fd]

    # Rebuild the probability table, then insert everything in one call:
    # executemany writes the whole list at once, no manual iteration needed.
    drop_probability()
    conn = sqlite3.connect("chappies_brain.db")
    c = conn.cursor()
    c.executemany("INSERT OR IGNORE INTO kette VALUES (?,?) ", values)

    conn.commit()
    conn.close()
Example #5
0
def getEmissionProb(sm, sents, tagset):
    """Estimate per-tag emission distributions from tagged sentences.

    Usage: ``P(word | tag) = emissionProb[tag].prob(word)``.

    Args:
        sm: smoothing name — "no", "laplace", "goodturing", or anything
            else for Witten-Bell.
        sents: iterable of sentences, each a list of (word, tag) pairs.
        tagset: tags to build a distribution for.

    Returns:
        dict mapping each tag to a smoothed distribution over the
        lower-cased words emitted with that tag.
    """
    observations = []
    for sent in sents:
        observations += [(word.lower(), tag) for (word, tag) in sent]

    emissionProb = {}
    for tag in tagset:
        emitted = FreqDist(w for (w, t) in observations if t == tag)

        if sm == "no":
            dist = LidstoneProbDist(emitted, 0, bins=1e5)
        elif sm == "laplace":
            dist = LidstoneProbDist(emitted, 1, bins=1e5)
        elif sm == "goodturing":
            dist = SimpleGoodTuringProbDist(emitted, bins=1e5)
        else:
            dist = WittenBellProbDist(emitted, bins=1e5)
        emissionProb[tag] = dist

    return emissionProb
#https://github.com/maxbane/simplegoodturing/blob/master/sgt.py

from nltk import SimpleGoodTuringProbDist, FreqDist

# Toy frequency distribution exercising the Simple Good-Turing estimator.
fd = FreqDist({'a':1, 'b':1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4, 'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7, 'o': 8, 'p': 9, 'q': 10})

p = SimpleGoodTuringProbDist(fd)
# Bug fix: `print(getattr(p))` raised TypeError (getattr needs an attribute
# name), and the p.prob('a') result was computed but discarded.  Print the
# smoothed probability instead.
print(p.prob('a'))
def good_turing_trigram_model(data):
    """Return a SimpleGoodTuringProbDist fitted to the trigram counts in *data*."""
    return SimpleGoodTuringProbDist(FreqDist(data))
Example #8
0
# Compare smoothing estimators via train_and_test (defined elsewhere in
# this file).  Named lambdas replaced with def per PEP 8 (E731).
def mle(fd, bins):
    """Maximum-likelihood estimate; the *bins* argument is ignored."""
    return MLEProbDist(fd)


print(train_and_test(mle))
print(train_and_test(LaplaceProbDist))
print(train_and_test(ELEProbDist))


def lidstone(gamma):
    """Return a Lidstone estimator factory with additive constant *gamma*."""
    return lambda fd, bins: LidstoneProbDist(fd, gamma, bins)


print(train_and_test(lidstone(0.1)))
print(train_and_test(lidstone(0.5)))
print(train_and_test(lidstone(1.0)))
# witten bell estimation
print(train_and_test(WittenBellProbDist))


def gt(fd, bins):
    """Simple Good-Turing estimator with a fixed bin count of 1e5."""
    return SimpleGoodTuringProbDist(fd, bins=1e5)


print(train_and_test(gt))
# kneser ney estimation
# Re-encode the first 100 sentences as trigram observations: each item pairs
# a (word, word, word) triple with its (tag, tag, tag) triple.
corpus = [[((x[0], y[0], z[0]), (x[1], y[1], z[1]))
           for x, y, z in nltk.trigrams(sent)] for sent in corpus[:100]]
# Collect the distinct tags and symbols seen in the trigram corpus.
# NOTE(review): unique_list is defined elsewhere — presumably it
# de-duplicates while preserving order; verify against its definition.
tag_set = unique_list(tag for sent in corpus for (word, tag) in sent)
print(len(tag_set))
symbols = unique_list(word for sent in corpus for (word, tag) in sent)
print(len(symbols))
# HMM trainer over the observed tag and symbol vocabularies.
trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
# Accumulators for the train/test split built by the loop below.
train_corpus = []
test_corpus = []
for i in range(len(corpus)):
    if i % 10:
        train_corpus += [corpus[i]]
    else: