Exemple #1
0
    def addToGrammar(self, grammar, level):
        """Push this object's tables into ``grammar`` at the given ``level``.

        Calls are issued in this order:

        1. ``addAncestry(level, pos, pos)`` for every key of
           ``self.termCounts``.
        2. One ``HierRule`` per ``(lhs, rhs)`` entry of ``self.ruleCounts``,
           sent to ``addEpsilonRule`` when the rule reports ``epsilon()`` and
           to ``addRule`` otherwise.
        3. One ``HierRule`` per ``(lhs, word)`` entry of ``self.termCounts``,
           sent to ``addTerminalRule``.
        4. ``addWordLookahead`` for every entry of ``self.ntToWord``.
        5. ``addNTToPos`` with ``self.ntToPos`` and ``addLambdas`` with
           ``self.lambdas``.

        Side effect: each value stored in ``self.ntToPos`` is replaced, in
        place, by a plain ``dict`` copy before the table is handed over.
        """
        # pos tags are not merged by this ctf, so each one maps to itself
        for tag in self.termCounts:
            grammar.addAncestry(level, tag, tag)

        for parent, expansions in self.ruleCounts.items():
            for children, weight in expansions.items():
                rule = HierRule(level)
                # A right-hand side whose first symbol is "EPSILON" is
                # passed on as an empty right-hand side.
                body = [] if children[0] == "EPSILON" else children
                rule.setup(parent, body, weight)

                if rule.epsilon():
                    register = grammar.addEpsilonRule
                else:
                    register = grammar.addRule
                register(rule)

        for tag, emissions in self.termCounts.items():
            for token, weight in emissions.items():
                terminal = HierRule(level)
                terminal.setup(tag, [token], weight)
                grammar.addTerminalRule(terminal)

        for nonterm, lookahead in self.ntToWord.items():
            for token, weight in lookahead.items():
                grammar.addWordLookahead(nonterm, token, weight, level)

        # Swap every inner table for a plain dict; only existing keys are
        # reassigned, so the outer mapping keeps its size and identity.
        for nonterm in self.ntToPos:
            self.ntToPos[nonterm] = dict(self.ntToPos[nonterm])
        grammar.addNTToPos(self.ntToPos, level)
        grammar.addLambdas(self.lambdas, level)
                rhs = [rhs1, fields[3]]
                prob = fields[4]
            elif len(fields) == 4:
                rhs = [rhs1,]
                prob = fields[3]

            prob = float(prob)

            rule = HierRule(level)

            if lhs.startswith("EPSILON"):
                assert(len(rhs) == 1)
                assert(rhs[0].startswith("EPSILON"))
                rhs = []

            rule.setup(lhs, rhs, prob)

            if rule.epsilon() or rule.unary():
#                print >>sys.stderr, "Skipping bogus unary", rule
                pass
            else:
                grammar.addRule(rule)

        unaryFile = workDir/("%s-txt-lvl%d.unaries.gz" % (basename, level))

        print >>sys.stderr, "Unaries from", unaryFile

        ct = 0
        for line in GzipFile(unaryFile):
            if ct % 1000 == 0:
                print >>sys.stderr, ct, "..."
            ct += 1
        
            fields = line.strip().split()
            (lhs, arrow, rhs1) = fields[0:3]
            assert(arrow == "->")
            if len(fields) == 5:
                rhs = [rhs1, fields[3]]
                prob = fields[4]
            elif len(fields) == 4:
                rhs = [rhs1,]
                prob = fields[3]

            prob = float(prob)

            rule = HierRule(level)
            rule.setup(lhs, rhs, prob)

            if [rule.lhs,] == rule.rhs and rule.prob == 1.0:
                print >>sys.stderr, "Warning: X->X", rule.lhs, rule.rhs
            else:
                grammar.addRule(rule)

    grammar.writeback("grammar")

    for level in range(maxLevel+1):
        print >>sys.stderr, "Level", level

        lexicon = workDir/("%s-txt-lvl%d.lexicon" % (basename, level))
        
        print >>sys.stderr, "Terminals from", lexicon