def good_turing_smoothing(self, emission, transition):
    """Apply Simple Good-Turing smoothing to HMM count tables.

    For every key in ``emission`` and ``transition``, the raw counts are
    wrapped in a FreqDist and smoothed with SimpleGoodTuringProbDist.

    Returns a ``(emission_probability, transition_probability)`` tuple of
    dicts mapping each key to its smoothed probability distribution.
    """
    emission_probability = {
        key: SimpleGoodTuringProbDist(FreqDist(counts))
        for key, counts in emission.items()
    }
    transition_probability = {
        key: SimpleGoodTuringProbDist(FreqDist(counts))
        for key, counts in transition.items()
    }
    return emission_probability, transition_probability
def create_entropy():
    """Compute a Simple Good-Turing probability for every word in the
    ``corpus`` table and store the values in the ``entropy`` table.

    Reads all rows from ``corpus``, splits them into words, builds a
    frequency distribution, smooths it with SimpleGoodTuringProbDist and
    inserts each ``(word, probability)`` pair into ``entropy``.  Inserts
    are best-effort: database errors (e.g. duplicate keys) are skipped.
    """
    conn = sqlite3.connect("chappies_brain.db")
    c = conn.cursor()
    words = []
    for row in c.execute('SELECT corpus FROM corpus'):
        # BUG FIX: the original computed row[0].rstrip() and then discarded
        # it by re-splitting the raw row; strip *before* splitting.
        # TODO: strip punctuation so individual words are recognised and
        # counted correctly (original note: "sonderzeichen entfernen").
        words.extend(row[0].rstrip().split(" "))
    counting_words = Counter(words)
    from nltk import SimpleGoodTuringProbDist, FreqDist
    fd = FreqDist(counting_words)
    p = SimpleGoodTuringProbDist(fd)
    for word in fd:
        entropy = round(p.prob(word), 7)
        try:
            c.execute("INSERT INTO entropy VALUES (?,?)", [word, entropy])
        except sqlite3.Error:
            # Best effort: skip rows the database rejects (duplicates etc.)
            # instead of the original bare `except: pass`, which also hid
            # programming errors.
            pass
    conn.commit()
    conn.close()
def getTransitionProb(sm, sents, tagset):
    """Estimate smoothed tag-transition distributions from tagged sentences.

    P(nextTag|prevTag) = transitionProb[prevTag].prob(nextTag)

    ``sm`` selects the smoothing method: "no" (Lidstone gamma=0, i.e. MLE),
    "laplace" (Lidstone gamma=1), "goodturing" (Simple Good-Turing), or
    Witten-Bell for any other value.
    """
    bigrams = []
    for sent in sents:
        bigrams += ngrams([tag for (_, tag) in sent], 2)
    transitionProb = {}
    for tag in tagset:
        followers = FreqDist(nxt for (prev, nxt) in bigrams if prev == tag)
        if sm == "no":
            dist = LidstoneProbDist(followers, 0, bins=1e5)
        elif sm == "laplace":
            dist = LidstoneProbDist(followers, 1, bins=1e5)
        elif sm == "goodturing":
            dist = SimpleGoodTuringProbDist(followers, bins=1e5)
        else:
            dist = WittenBellProbDist(followers, bins=1e5)
        transitionProb[tag] = dist
    return transitionProb
def calculate_probability():
    """Recompute Simple Good-Turing probabilities for every distinct corpus
    entry and rewrite them into the ``kette`` table.

    Counts how often each entry ("Kette") occurs in ``corpus``, smooths the
    counts with SimpleGoodTuringProbDist, drops the old probability table via
    ``drop_probability()`` and bulk-inserts the ``(entry, probability)``
    pairs with a single ``executemany``.
    """
    conn = sqlite3.connect("chappies_brain.db")
    c = conn.cursor()
    # Count occurrences of each chain in the corpus.
    # (Removed: an unused SELECT count(*) query whose result `laenge` was
    # only consumed by commented-out dead code, and a commit() on this
    # read-only connection.)
    kette_dict = {}
    for row in c.execute('SELECT corpus, count(*) FROM corpus group by corpus'):
        kette_dict[row[0]] = row[1]
    conn.close()

    fd = FreqDist(kette_dict)
    print(fd)
    p = SimpleGoodTuringProbDist(fd)
    print("Run Simple Good Turing", fd)
    values = [(kette, p.prob(kette)) for kette in fd]

    drop_probability()
    conn = sqlite3.connect("chappies_brain.db")
    c = conn.cursor()
    # Insert the probabilities into the DB; executemany writes the whole
    # list at once, no manual iteration needed.
    c.executemany("INSERT OR IGNORE INTO kette VALUES (?,?) ", values)
    conn.commit()
    conn.close()
def getEmissionProb(sm, sents, tagset):
    """Estimate smoothed word-emission distributions from tagged sentences.

    P(word|tag) = emissionProb[tag].prob(word)

    Words are lower-cased before counting.  ``sm`` selects the smoothing:
    "no" (Lidstone gamma=0, i.e. MLE), "laplace" (Lidstone gamma=1),
    "goodturing" (Simple Good-Turing), or Witten-Bell for any other value.
    """
    pairs = []
    for sent in sents:
        pairs.extend((word.lower(), tag) for (word, tag) in sent)
    # Dispatch table from smoothing name to distribution factory.
    smoothers = {
        "no": lambda fd: LidstoneProbDist(fd, 0, bins=1e5),
        "laplace": lambda fd: LidstoneProbDist(fd, 1, bins=1e5),
        "goodturing": lambda fd: SimpleGoodTuringProbDist(fd, bins=1e5),
    }
    make = smoothers.get(sm, lambda fd: WittenBellProbDist(fd, bins=1e5))
    emissionProb = {}
    for tag in tagset:
        fd = FreqDist(w for (w, t) in pairs if t == tag)
        emissionProb[tag] = make(fd)
    return emissionProb
# https://github.com/maxbane/simplegoodturing/blob/master/sgt.py
from nltk import SimpleGoodTuringProbDist, FreqDist

# Toy frequency distribution covering several distinct frequencies, which
# Simple Good-Turing needs for its frequency-of-frequencies regression.
fd = FreqDist({'a': 1, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 4, 'g': 4,
               'h': 5, 'i': 5, 'j': 6, 'k': 6, 'l': 6, 'm': 7, 'n': 7,
               'o': 8, 'p': 9, 'q': 10})
p = SimpleGoodTuringProbDist(fd)
# BUG FIX: the original called `getattr(p)` with a single argument, which
# raises TypeError, and discarded the result of `p.prob('a')` on the line
# before.  Print the smoothed probability instead.
print(p.prob('a'))
def good_turing_trigram_model(data):
    """Return a Simple Good-Turing smoothed probability distribution over
    the trigrams observed in ``data``."""
    return SimpleGoodTuringProbDist(FreqDist(data))
mle = lambda fd, bins: MLEProbDist(fd) print(train_and_test(mle)) print(train_and_test(LaplaceProbDist)) print(train_and_test(ELEProbDist)) def lidstone(gamma): return lambda fd, bins: LidstoneProbDist(fd, gamma, bins) print(train_and_test(lidstone(0.1))) print(train_and_test(lidstone(0.5))) print(train_and_test(lidstone(1.0))) # witten bell estimation print(train_and_test(WittenBellProbDist)) gt = lambda fd, bins: SimpleGoodTuringProbDist(fd, bins=1e5) print(train_and_test(gt)) # kneser ney estimation corpus = [[((x[0], y[0], z[0]), (x[1], y[1], z[1])) for x, y, z in nltk.trigrams(sent)] for sent in corpus[:100]] tag_set = unique_list(tag for sent in corpus for (word, tag) in sent) print(len(tag_set)) symbols = unique_list(word for sent in corpus for (word, tag) in sent) print(len(symbols)) trainer = nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols) train_corpus = [] test_corpus = [] for i in range(len(corpus)): if i % 10: train_corpus += [corpus[i]] else: