def addToGrammar(self, grammar, level):
    """Push the contents of this object's tables into `grammar` at `level`.

    The calls are made in this order:

    1. ``grammar.addAncestry(level, tag, tag)`` for every key of
       ``self.termCounts``.
    2. One ``HierRule`` per (lhs, rhs, prob) entry of ``self.ruleCounts``;
       rules that report ``epsilon()`` go to ``grammar.addEpsilonRule``,
       all others to ``grammar.addRule``.
    3. One single-word ``HierRule`` per (lhs, word, prob) entry of
       ``self.termCounts``, passed to ``grammar.addTerminalRule``.
    4. ``grammar.addWordLookahead`` for every entry of ``self.ntToWord``.
    5. ``grammar.addNTToPos(self.ntToPos, level)``.
    6. ``grammar.addLambdas(self.lambdas, level)``.

    Side effect: each value stored in ``self.ntToPos`` is replaced by a
    plain ``dict`` copy of itself before the table is handed over.
    """
    # POS tags are not merged by this ctf, so every tag is paired with
    # itself in the ancestry call.
    for tag in self.termCounts:
        grammar.addAncestry(level, tag, tag)

    for parent, expansions in self.ruleCounts.items():
        for children, weight in expansions.items():
            rule = HierRule(level)
            # An expansion whose first symbol is the literal "EPSILON" is
            # given to the rule as an empty right-hand side.
            body = [] if children[0] == "EPSILON" else children
            rule.setup(parent, body, weight)
            if rule.epsilon():
                register = grammar.addEpsilonRule
            else:
                register = grammar.addRule
            register(rule)

    for tag, lexicon in self.termCounts.items():
        for word, weight in lexicon.items():
            terminal = HierRule(level)
            terminal.setup(tag, [word], weight)
            grammar.addTerminalRule(terminal)

    for nonterm, lookaheads in self.ntToWord.items():
        for word, weight in lookaheads.items():
            grammar.addWordLookahead(nonterm, word, weight, level)

    # Swap each inner table for a plain dict before passing the mapping on.
    for nonterm, tags in self.ntToPos.items():
        self.ntToPos[nonterm] = dict(tags)
    grammar.addNTToPos(self.ntToPos, level)

    grammar.addLambdas(self.lambdas, level)
rhs = [rhs1, fields[3]] prob = fields[4] elif len(fields) == 4: rhs = [rhs1,] prob = fields[3] prob = float(prob) rule = HierRule(level) if lhs.startswith("EPSILON"): assert(len(rhs) == 1) assert(rhs[0].startswith("EPSILON")) rhs = [] rule.setup(lhs, rhs, prob) if rule.epsilon() or rule.unary(): # print >>sys.stderr, "Skipping bogus unary", rule pass else: grammar.addRule(rule) unaryFile = workDir/("%s-txt-lvl%d.unaries.gz" % (basename, level)) print >>sys.stderr, "Unaries from", unaryFile ct = 0 for line in GzipFile(unaryFile): if ct % 1000 == 0: print >>sys.stderr, ct, "..."
ct += 1 fields = line.strip().split() (lhs, arrow, rhs1) = fields[0:3] assert(arrow == "->") if len(fields) == 5: rhs = [rhs1, fields[3]] prob = fields[4] elif len(fields) == 4: rhs = [rhs1,] prob = fields[3] prob = float(prob) rule = HierRule(level) rule.setup(lhs, rhs, prob) if [rule.lhs,] == rule.rhs and rule.prob == 1.0: print >>sys.stderr, "Warning: X->X", rule.lhs, rule.rhs else: grammar.addRule(rule) grammar.writeback("grammar") for level in range(maxLevel+1): print >>sys.stderr, "Level", level lexicon = workDir/("%s-txt-lvl%d.lexicon" % (basename, level)) print >>sys.stderr, "Terminals from", lexicon