コード例 #1
0
ファイル: inducer.py プロジェクト: keggsmurph21/cky-parser
def _parse_bracketed_tree(text):
    """Build a Node/Leaf tree from one bracketed parse string.

    The string is scanned one character at a time:

    * ``(`` opens a new ``Node`` as a child of the current node;
    * ``)`` closes the current node (and first steps out of a ``Leaf`` if
      one is being read);
    * whitespace marks the current node's tag as complete;
    * any other character extends, in order of precedence, the text of the
      ``Leaf`` being read, a freshly created ``Leaf`` (when the tag is
      already complete), or the current node's tag.

    Returns the synthetic root ``Node``; the parsed tree(s) hang off
    ``root.children``.
    """
    root = Node()
    current = root

    for char in text:
        if char == '(':
            child = Node()
            child.parent = current

            current.children.append(child)
            current = child

        elif char == ')':
            # A leaf is never opened by its own '(' so one ')' has to pop
            # both the leaf and the node that owns it.
            if isinstance(current, Leaf):
                current = current.parent
            current = current.parent

        elif char.isspace():
            # Whitespace ends the tag; later characters belong to a leaf.
            current.tag_parsed = True

        elif isinstance(current, Leaf):
            current.text += char

        elif current.tag_parsed:
            leaf = Leaf()
            leaf.parent = current
            leaf.text += char

            current.children.append(leaf)
            current = leaf

        else:
            current.tag += char

    return root


def _collect_rules(node, leafs, nodes):
    """Record the productions rooted at ``node`` into ``leafs`` / ``nodes``.

    Leaf nodes are ignored.  For any other node, the text of each leaf child
    is added to ``leafs[node.tag]``, and the tags of the non-leaf children
    (if there are any) are combined with ``slugify`` and added to
    ``nodes[node.tag]``.
    """
    if node.is_leaf():
        return

    child_tags = []

    for child in node.children:
        if child.is_leaf():
            leafs[node.tag].add(child.text)
        else:
            child_tags.append(child.tag)

    if child_tags:
        nodes[node.tag].add(slugify(child_tags))


def induce(parsed_sents):
    """Induce a ``Grammar`` from an iterable of bracketed parse trees.

    Each item is converted with ``str()`` and parsed into a tree; every node
    of the first tree under the synthetic root is visited via ``descend`` to
    collect its productions, which are then registered with
    ``Grammar.add_rule``.  Rules built from child tags are added with the
    third argument ``False``; rules built from leaf text are added with
    ``True`` (presumably a "terminal" flag -- confirm against ``Grammar``).
    ``Grammar.dedup()`` is called once after all sentences are processed.

    NOTE(review): a sentence that contains no ``(`` leaves the root without
    children, so ``root.children[0]`` will presumably raise ``IndexError``;
    this matches the previous behaviour and is left to the caller.

    Args:
        parsed_sents: iterable of objects whose ``str()`` is a bracketed
            parse, e.g. ``(S (NP (DT the) (NN dog)) (VP (VBD barked)))``.

    Returns:
        The populated ``Grammar`` instance.
    """
    g = Grammar()

    for parsed_sent in parsed_sents:
        root = _parse_bracketed_tree(str(parsed_sent))

        # Rules are collected per sentence, keyed by the left-hand-side tag.
        leafs = defaultdict(set)
        nodes = defaultdict(set)

        # The callback is invoked immediately by descend(), so binding the
        # per-sentence dicts through a lambda is safe here.
        root.children[0].descend(
            lambda node: _collect_rules(node, leafs, nodes)
        )

        for src, tar_set in nodes.items():
            for slugged_tars in tar_set:
                g.add_rule(src, deslugify(slugged_tars), False)

        for src, tars in leafs.items():
            g.add_rule(src, tars, True)

    g.dedup()
    return g