def init_GT_smoothing(self):
    """Fit Good-Turing-style smoothing curves for every (i, state) entry.

    For each key in ``self.state_counts`` this builds two
    frequency-of-frequency ("nr") tables -- one over coincidence counts,
    one over bigram counts -- and least-squares fits ``nr = a + b*log(n)``
    to each via ``tools.linear_fit``.  Results are stored in ``self.nr``
    under the keys ``((i, state), 'coincedence')`` and
    ``((i, state), 'bigram')`` as ``(a, b, nr-table)`` triples.

    NOTE(review): assumes ``self.state_counts`` iterates as (i, state)
    pairs -- confirm against the attribute's definition.
    """
    self.nr = {}
    for i, state in self.state_counts:
        # Coincidence frequency-of-frequency counts...
        c_nr = {}
        # ...and bigram frequency-of-frequency counts.
        bigram_nr = {}
        for obs in self.observations:
            if (i, obs, state) in self.coincedences:
                # Tally how many observations share this coincidence count.
                c_nr[self.coincedences[i, obs, state]] = c_nr.get(self.coincedences[i, obs, state], 0) + 1
        unseen_bigrams = 0
        for j, state1 in self.state_counts:
            if j != i:
                # Only pair states that share the same index i.
                continue
            if (i, state, state1) in self.bigrams:
                bigram_nr[self.bigrams[i, state, state1]] = bigram_nr.get(self.bigrams[i, state, state1], 0) + 1
            else:
                unseen_bigrams += 1
        # Seed the fit with the mass reserved for unseen events
        # (anchor point x=1, y = unseen mass spread over all states).
        x,y = [1], [self.unseen_coincedences * len(self.states)]
        for n,count in c_nr.items():
            x.append(math.log(n+1))
            y.append(count)
        # Find a least-squares fit of the frequency counts to nr = a + b*log(x).
        # The fitted functions sometimes do dive under zero! (Which doesn't seem to be good)
        # By adding a very large value that is zero we sort of solve this?
        #(a,b) = y[0], 0
        x.append(math.log(100000))
        y.append(0)
        # Record the unseen mass at count 0 in the stored table as well.
        c_nr[0] = self.unseen_coincedences * len(self.states)
        (a,b) = tools.linear_fit(x,y)
        self.nr[(i, state), 'coincedence'] = (a,b, c_nr)
        # Same procedure for the bigram counts.  NOTE(review): the anchor
        # point and log offsets (log(n) vs log(n+1), count+1 vs count,
        # pin at log(100) vs log(100000)) deliberately differ from the
        # coincidence fit above -- confirm this asymmetry is intended.
        p,q = [0], [unseen_bigrams]
        for n,count in bigram_nr.items():
            p.append(math.log(n))
            q.append(count+1)
        # Find a least-squares fit of the frequency counts to nr = a + b*log(x).
        # The fitted functions sometimes do dive under zero!
        #(a,b) = y[0], 0
        p.append(math.log(100))
        q.append(0)
        bigram_nr[0] = unseen_bigrams
        #print '{0}: {1}'.format((i, state), bigram_nr)
        (a,b) = tools.linear_fit(p,q)
        self.nr[(i, state), 'bigram'] = (a,b, bigram_nr)
def init_GT_smoothing(self):
    """Fit per-state Good-Turing smoothing curves.

    For every state in ``self.states`` this builds a frequency-of-frequency
    table of the coincidence counts observed for that state, then fits
    ``log(count) = a + b*log(n)`` to it with ``tools.linear_fit``.  The
    result is stored in ``self.nr[state]`` as ``(a, b, unseen)``, where
    ``unseen`` is ``self.unseen_coincedences`` plus the number of
    (obs, state) pairs never observed.  States whose table has fewer than
    two distinct counts cannot be fitted and are marked ``'undefined'``.
    """
    self.nr = {}
    undef = 0
    for state in self.states:
        # Frequency-of-frequency table: count -> number of observations
        # that occurred exactly `count` times with this state.
        nr = {}
        unseen = self.unseen_coincedences
        for obs in self.observations:
            if (obs, state) in self.coincedences:
                count = self.coincedences[obs, state]
                nr[count] = nr.get(count, 0) + 1
            else:
                unseen += 1
        # A linear fit needs at least two distinct points.  The original
        # guard only caught len(nr) == 1; an empty table fell through and
        # crashed below on an undefined loop variable.
        if len(nr) < 2:
            self.nr[state] = 'undefined'
            undef += 1
            continue
        # Least-squares fit of log(count) = a + b*log(n) in log-log space.
        x, y = [], []
        for n, count in nr.items():
            x.append(math.log(n))
            y.append(math.log(count))
        (a, b) = tools.linear_fit(x, y)
        self.nr[state] = (a, b, unseen)
    print("Smoothing disabled on {0} out of {1} states".format(undef, len(self.states)))