def test_addNgramCount(self):
    """Check the root node's bookkeeping after adding every trigram of a sentence.

    Each consecutive window of ``order`` tokens is fed to
    ``NgramLM.addNgramCount``.  Afterwards the root node is expected to hold
    one ``count`` entry and one child per added n-gram, and every token from
    position ``order - 2`` up to the second-to-last one must be a key of the
    root's children.
    """
    order = 3
    lm = NgramLM(order)
    li = "who knows who can do that ?".split()
    # Number of complete n-grams of length ``order`` in the sentence.
    ngramCount = len(li) - order + 1

    for i in range(ngramCount):
        lm.addNgramCount(li[i:i + order])

    self.assertEqual(len(lm._root.count), ngramCount)
    self.assertEqual(len(lm._root.children), ngramCount)

    # ``assertIn`` replaces the Python-2-only ``dict.has_key`` and reports the
    # missing token on failure.
    # NOTE(review): assumes ``_root.children`` supports the ``in`` operator
    # (e.g. is a dict) -- confirm against NgramLM.
    for i in range(order - 2, len(li) - 1):
        self.assertIn(li[i], lm._root.children)
def build_LM(in_file):
    """
    Build one language model per label found in in_file.

    Each non-blank line of in_file holds a label and a text string separated
    by the first space on the line.  A model is created the first time a
    label is seen and is then trained on the text of every line carrying
    that label.

    The model options (gram_size, token_based, start_end, case_sensitive,
    strip_out, add_one_smoothing) are read from names defined outside this
    function.

    Returns a dict mapping each label to its NgramLM.
    Raises ValueError when a non-blank line contains no space, i.e. a label
    without any text.
    """
    print('building language models...')
    # LMs is a dict to store the LM for each language
    LMs = {}
    with open(in_file, 'r') as training_file:
        for line_no, raw_line in enumerate(training_file, 1):
            line = raw_line.strip("\r\n")
            # Blank lines carry no training data; skipping them avoids an
            # opaque unpacking error on e.g. an empty line in the file.
            if not line.strip():
                continue
            label, separator, text = line.partition(" ")
            if not separator:
                raise ValueError(
                    "line %d of %s has a label but no text: %r"
                    % (line_no, in_file, line))
            if label not in LMs:
                LMs[label] = NgramLM(label,
                                     gram_size=gram_size,
                                     token_based=token_based,
                                     start_end=start_end,
                                     case_sensitive=case_sensitive,
                                     strip_out=strip_out,
                                     add_one_smoothing=add_one_smoothing)
            LMs[label].train(text)
    return LMs
def __init__(self, countkey = lambda tree: tree.data["pos"], smoothing="ml"):
    """Set up the three n-gram models and remember the configuration.

    countkey:  callable applied to a tree node to obtain the value that is
               counted; by default it reads ``tree.data["pos"]``.
    smoothing: name of the estimation method; stored as given.
    """
    # Configuration first, then the models (head, left, right).
    self.countkey = countkey
    self.smoothing = smoothing
    # NOTE(review): the NgramLM argument is presumably the n-gram order
    # (1 for heads, 3 for left/right children) -- confirm against NgramLM.
    self.probHead = NgramLM(1)
    self.probLeft, self.probRight = NgramLM(3), NgramLM(3)
class DependencyLM(object):
    """Language model over dependency trees, backed by three n-gram models.

    * ``probHead``  receives one-element n-grams: the key of each node
      whose ``parent`` is None.
    * ``probLeft`` / ``probRight`` receive three-element n-grams for the
      left / right children returned by ``node.partitionChildren()``:
      ``[key of the preceding child in that list or "___none",
      key of the child's parent + "___head", key of the child]``.

    The "key" of a node is whatever ``countkey(node)`` returns.
    """

    def __init__(self, countkey = lambda tree: tree.data["pos"], smoothing="ml"):
        """
        countkey:  callable mapping a tree node to the value that is counted;
                   default reads ``tree.data["pos"]``
        smoothing: estimation method; only "ml" (maximum likelihood) is
                   supported by the estimation step
        """
        self.probHead = NgramLM(1)
        self.probLeft = NgramLM(3)
        self.probRight = NgramLM(3)
        self.countkey = countkey
        self.smoothing = smoothing

    @staticmethod
    def _isPath(file):
        """Return True when `file` is a path string rather than an open stream."""
        try:
            stringTypes = basestring  # Python 2: covers str and unicode
        except NameError:
            stringTypes = str  # Python 3
        return isinstance(file, stringTypes)

    def _estimate(self):
        """Turn the accumulated counts of all three models into probabilities.

        Raises NotImplementedError when ``self.smoothing`` is not "ml".
        """
        if self.smoothing != "ml":
            # NotImplementedError is the exception; the NotImplemented
            # constant is not callable and would raise a TypeError instead.
            raise NotImplementedError("Currently only maximum likelihood is supported as a smoothing method (though ML is not smoothing technique)")
        # Apply maximum likelihood estimation to get probability from counts
        for lm in (self.probHead, self.probLeft, self.probRight):
            lm.mlEstimate()

    def train(self, filename, modelFile, progress=10000):
        """
        Count n-grams over every tree in `filename`, estimate probabilities
        and write the model to `modelFile` as plain text.

        filename:  Filename of CONLL format
        modelFile: path the plain-text model is written to
        progress:  print dot(.) every #progress trees; default: 10000.
                   A value of 0 or less disables the dots.

        Raises NotImplementedError when ``self.smoothing`` is not "ml"
        (after counting, before anything is written).
        """
        showProgress = progress > 0
        cnt = 0
        print("Model training started")
        for depString in dph.readDependencyFile(filename):
            cnt += 1
            if showProgress and cnt % progress == 0:
                sys.stdout.write(".")
                sys.stdout.flush()  # make the dot visible immediately
            tree = dph.stringToDependencyTreeWeakRef(depString)
            if tree is None:
                continue
            self.countFreq(tree)

        self._estimate()

        # Terminate the line of dots only if at least one dot was printed;
        # otherwise there would be a stray empty line.
        if showProgress and cnt >= progress:
            sys.stdout.write("\n")
        print("Model training has successfully finished!")
        print("Writing model infomation to %s" % (modelFile, ))
        self.saveModelAsPlainText(modelFile)
        print("Finished writing model information to %s" % (modelFile, ))

    def saveModelAsPlainText(self, filename):
        """Write the three models to `filename`, each under its section header."""
        with open(filename, "w") as model:
            model.write("[probHead]\n")
            self.probHead.saveNgramInfo(fstream=model)
            model.write("[probLeft]\n")
            self.probLeft.saveNgramInfo(fstream=model)
            model.write("[probRight]\n")
            self.probRight.saveNgramInfo(fstream=model)

    def readModelFromPlainText(self, file):
        """
        Load n-gram counts from a plain-text model and re-estimate
        probabilities from them.

        file: a path (opened and closed here) or an iterable of lines such
              as an open text stream (left open for the caller).

        Blank lines are ignored.  The lines "[probHead]", "[probLeft]" and
        "[probRight]" select the model that receives the following entries.
        Every other line must be ``ngram<TAB>count<TAB>logProb`` with the
        n-gram tokens separated by single spaces; the stored logProb is
        ignored because probabilities are recomputed from the counts.

        Raises ValueError for an entry that appears before any section
        header or that does not have exactly three tab-separated fields,
        and NotImplementedError when ``self.smoothing`` is not "ml".
        """
        sections = {
            "[probHead]": self.probHead,
            "[probLeft]": self.probLeft,
            "[probRight]": self.probRight,
        }
        openedHere = self._isPath(file)
        model = open(file, "r") if openedHere else file
        try:
            target = None
            for line in model:
                line = line.strip()
                if line == "":
                    continue
                if line in sections:
                    target = sections[line]
                    continue
                if target is None:
                    raise ValueError("n-gram entry found before any section header: %r" % (line, ))
                ngram, count, _logProb = line.split("\t")
                target.addNgramCount(ngram.split(" "), int(count))
            self._estimate()
        finally:
            # If filestream is opened in this function, close it -- also
            # when parsing or estimation fails.
            if openedHere:
                model.close()

    def saveModelAsProtocolBuffer(self, filename):
        """Serialize the three models into a depLM message written to `filename`."""
        with open(filename, "wb") as model:
            lmpb = depLM_pb2.depLM()
            self.probHead.writeMessage(lmpb.probHead.ngramEntries)
            self.probLeft.writeMessage(lmpb.probLeft.ngramEntries)
            self.probRight.writeMessage(lmpb.probRight.ngramEntries)
            model.write(lmpb.SerializeToString())

    def readModelFromProtocolBuffer(self, file):
        """
        Load probabilities and counts from a serialized depLM message.

        file: a path (opened in binary mode and closed here) or an object
              with ``read()`` returning the serialized bytes (left open).
        """
        openedHere = self._isPath(file)
        model = open(file, "rb") if openedHere else file
        lmpb = depLM_pb2.depLM()
        try:
            lmpb.ParseFromString(model.read())
        finally:
            # The message is fully parsed at this point, so the stream is
            # no longer needed.
            if openedHere:
                model.close()

        for lm, section in ((self.probHead, lmpb.probHead),
                            (self.probLeft, lmpb.probLeft),
                            (self.probRight, lmpb.probRight)):
            for ngramEntry in section.ngramEntries:
                lm.addNgramProb(ngramEntry.ngram, ngramEntry.prob)
                lm.addNgramCount(ngramEntry.ngram, ngramEntry.count)

    def _countSiblings(self, lm, siblings):
        """Add one trigram per entry of `siblings` to `lm`, in list order."""
        previous = "___none"  # marker for "no preceding sibling"
        for child in siblings:
            key = self.countkey(child)
            lm.addNgramCount([previous, self.countkey(child.parent) + "___head", key])
            previous = key

    def countFreq(self, node):
        """
        Recursively add the n-grams of the subtree rooted at `node`.

        A node without parent contributes its key to ``probHead``; its left
        and right children (from ``node.partitionChildren()``) contribute
        trigrams to ``probLeft`` and ``probRight``; then every entry of
        ``node.children`` is processed the same way.
        """
        if node.parent is None:
            self.probHead.addNgramCount([self.countkey(node)])

        left, right = node.partitionChildren()
        self._countSiblings(self.probLeft, left)
        self._countSiblings(self.probRight, right)

        for child in node.children:
            self.countFreq(child)