Beispiel #1
0
 def __init__(self):
   """Create an untrained instance: every table empty, state count zero."""
   # Start table and transition table.
   # NOTE(review): dd is presumably collections.defaultdict -- confirm
   # against the file's imports.
   self.pi = list()
   self.transitions = dd(list)
   # Emission table, the list of states, and the running state count.
   self.emissions = list()
   self.states = list()
   self.nstates = 0
   # Guesser is defined elsewhere; the meaning of its argument (10) is not
   # visible from here.
   self.guesser = Guesser(10)
   # Vocabulary starts out empty.
   self.vocab = set()
Beispiel #2
0
class Learner:
  """Count-based estimator for a first-order HMM over (word, tag) sentences.

  ``train`` counts sentence-initial tags, tag-to-tag transitions and
  tag-to-word emissions, then ``normalize`` turns the counts into relative
  frequencies. A state is an integer index into ``self.states``;
  ``self.pi``, ``self.emissions`` and the rows and columns of
  ``self.transitions`` all share that indexing.
  """

  def __init__(self):
    self.pi = []                 # per state: sentence-start count, later probability
    self.transitions = dd(list)  # state -> list with one entry per next state
    self.emissions = []          # per state: dict word -> count, later probability
    self.states = []             # tag of each state index
    self.nstates = 0             # kept equal to len(self.states)
    # Guesser is defined elsewhere; it is fed every training sentence and
    # the final tag list. The meaning of the argument 10 is not visible here.
    self.guesser = Guesser(10)
    self.vocab = set()           # every word seen during training

  def _stateIndex(self, tag):
    """Return the state index of ``tag``, registering a new state if needed.

    A new state gets a zero start count, an empty emission table and a
    zero transition row; every existing row grows by one column.
    """
    try:
      return self.states.index(tag)
    except ValueError:
      # list.index signals "not found" with ValueError; anything else is a
      # real error and must propagate.
      pass
    index = len(self.states)
    for row in range(index):
      self.transitions[row].append(0)
    self.states.append(tag)
    self.nstates += 1
    self.transitions[index] = [0] * len(self.states)
    self.emissions.append({})
    # Every state starts at 0 so start counts do not depend on whether a
    # tag was first met at the beginning or in the middle of a sentence.
    self.pi.append(0)
    return index

  def train(self, sentences):
    """Accumulate counts from ``sentences`` and normalize them.

    Args:
      sentences: iterable of sentences, each a sequence of (word, tag)
        pairs. Empty sentences are skipped.

    Side effects: updates all count tables and ``self.vocab``, passes each
    non-empty sentence to ``self.guesser.loadWords``, then calls
    ``setTags``/``computeTheta`` on the guesser and ``self.normalize()``.
    """
    for sentence in sentences:
      if not sentence:
        continue
      self.guesser.loadWords(sentence)
      prevState = None
      for word, tag in sentence:
        self.vocab.add(word)
        state = self._stateIndex(tag)
        wordCounts = self.emissions[state]
        wordCounts[word] = wordCounts.get(word, 0) + 1
        if prevState is None:
          # first token of the sentence contributes to the start counts
          self.pi[state] += 1
        else:
          self.transitions[prevState][state] += 1
        prevState = state
    self.guesser.setTags(self.states)
    self.guesser.computeTheta()
    self.normalize()

  def normalize(self):
    """Convert start, emission and transition counts to relative frequencies.

    Results are stored as real lists/dicts so they can be indexed and
    summed again on both Python 2 and Python 3. A state with no outgoing
    transitions (a tag only ever seen sentence-finally) keeps an all-zero
    row instead of aborting; ``laplaceSmoothTransitions`` can fill it.

    Raises:
      AssertionError: if there are no start counts at all, or a state has
        no emissions.
    """
    totalStart = sum(self.pi)
    assert totalStart != 0, "no sentence-start counts to normalize"
    # "* 1.0" forces true division under Python 2 as well
    self.pi = [count * 1.0 / totalStart for count in self.pi]
    for state in range(len(self.states)):
      wordCounts = self.emissions[state]
      totalEmisOut = sum(wordCounts.values())
      assert totalEmisOut != 0, "state without emissions"
      self.emissions[state] = {
          word: count * 1.0 / totalEmisOut
          for word, count in wordCounts.items()
      }
      row = self.transitions[state]
      totalTransOut = sum(row)
      if totalTransOut == 0:
        self.transitions[state] = [0.0] * len(row)
      else:
        self.transitions[state] = [count * 1.0 / totalTransOut for count in row]

  def laplaceSmoothTransitions(self):
    """Replace zero transition entries with 1/len(states) and renormalize.

    Only entries that are exactly zero are changed; each row is then
    rescaled to sum to one. Does nothing when there are no states.
    """
    if not self.states:
      return
    alpha = 1.0 / len(self.states)
    for state in range(len(self.states)):
      row = [alpha if prob == 0 else prob for prob in self.transitions[state]]
      totalTransOut = sum(row)
      self.transitions[state] = [prob * 1.0 / totalTransOut for prob in row]