def __init__(self):
    """Create an empty, untrained instance.

    Every container starts out empty and the state counter starts at
    zero; the only non-trivial construction is the ``Guesser(10)`` helper.
    """
    # Plain containers, all initially empty.
    self.vocab = set()
    self.states = []
    self.emissions = []
    self.pi = []

    # ``dd`` is presumably an alias for collections.defaultdict, so a
    # missing key would yield a fresh empty list -- TODO confirm against
    # the file's imports.
    self.transitions = dd(list)

    # Number of entries expected in ``self.states``.
    self.nstates = 0

    # ``Guesser`` is defined outside this block; the meaning of the
    # argument 10 is not visible from here.
    self.guesser = Guesser(10)
class Learner:
    """Estimate start, transition and emission tables from tagged sentences.

    Attributes:
        pi: list indexed by state; sentence-start counts during training,
            fractions summing to 1 after ``normalize``.
        transitions: mapping ``state index -> list`` indexed by next state;
            counts during training, fractions after ``normalize``.
        emissions: list indexed by state of ``{word: count}`` dicts;
            fractions after ``normalize``.
        states: list of tag labels; a tag's position is its state index.
        nstates: number of registered states (kept equal to ``len(states)``).
        guesser: ``Guesser(10)`` helper defined elsewhere in the project; it
            is fed every sentence and, at the end of training, the tag list.
        vocab: set of every word seen in training.
    """

    def __init__(self):
        self.pi = []
        self.transitions = dd(list)
        self.emissions = []
        self.states = []
        self.nstates = 0
        self.guesser = Guesser(10)
        self.vocab = set()

    def _state_index(self, tag):
        """Return the state index of ``tag``, registering it if it is new.

        Registering a state grows every table by one slot: a zero start
        count, a zero column in each existing transition row, a fresh
        all-zero transition row and an empty emission dict.
        """
        try:
            return self.states.index(tag)
        except ValueError:
            # list.index signals "not found" with ValueError: new tag.
            pass

        state = len(self.states)
        self.pi.append(0)
        for row in range(state):
            self.transitions[row].append(0)
        self.states.append(tag)
        self.nstates += 1
        self.transitions[state] = [0] * len(self.states)
        self.emissions.append({})
        return state

    def train(self, sentences):
        """Count events over ``sentences`` and normalise the tables.

        Args:
            sentences: iterable of sentences, each a sequence of
                ``(word, tag)`` pairs. Empty sentences are skipped.

        Raises:
            AssertionError: propagated from ``normalize`` when a table row
                has no observations (for example no sentences at all, or a
                tag that is never followed by another tag).

        Note:
            Counts are replaced by fractions at the end, so calling
            ``train`` a second time on the same instance would add raw
            counts on top of already normalised values.
        """
        for sentence in sentences:
            if not sentence:
                # Nothing to count; indexing sentence[0] would fail.
                continue
            self.guesser.loadWords(sentence)

            prev_state = None
            for word, tag in sentence:
                self.vocab.add(word)
                state = self._state_index(tag)
                if prev_state is None:
                    # Sentence-initial token: exactly one start observation.
                    # (A new tag used to be seeded with 1 and then
                    # incremented, which over-counted it by one.)
                    self.pi[state] += 1
                else:
                    self.transitions[prev_state][state] += 1
                emitted = self.emissions[state]
                emitted[word] = emitted.get(word, 0) + 1
                prev_state = state

        self.guesser.setTags(self.states)
        self.guesser.computeTheta()
        self.normalize()

    def normalize(self):
        """Convert the raw counts in every table into relative frequencies.

        ``pi``, each emission dict and each transition row are divided by
        their own total. Results are stored as real lists/dicts so they can
        be indexed and re-iterated on both Python 2 and Python 3.

        Raises:
            AssertionError: if ``pi``, an emission dict or a transition row
                sums to zero.
        """
        total_start = sum(self.pi)
        assert total_start != 0, "no sentence-start observations"
        self.pi = [count * 1.0 / total_start for count in self.pi]

        for state in range(len(self.states)):
            emitted = self.emissions[state]
            total_emitted = sum(emitted.values())
            assert total_emitted != 0, "state has no emission observations"
            self.emissions[state] = {
                word: count * 1.0 / total_emitted
                for word, count in emitted.items()
            }

            row = self.transitions[state]
            total_out = sum(row)
            assert total_out != 0, "state has no outgoing transitions"
            self.transitions[state] = [
                count * 1.0 / total_out for count in row
            ]

    def laplaceSmoothTransitions(self):
        """Replace zero transition entries and renormalise each row.

        Every entry equal to 0 is set to ``1.0 / len(self.states)``; the row
        is then divided by its new total so it sums to 1 again. Does nothing
        when no states are registered.
        """
        nstates = len(self.states)
        if nstates == 0:
            # Avoid dividing by zero on an untrained instance.
            return

        alpha = 1.0 / nstates
        for state in range(nstates):
            row = [
                alpha if value == 0 else value
                for value in self.transitions[state]
            ]
            total_out = sum(row)
            self.transitions[state] = [
                value * 1.0 / total_out for value in row
            ]