Esempio n. 1
0
        def generate_derivation(hg  # type: HyperGraph
                                ):
            """Reduce ``hg`` rule by rule, yielding one HRG rule per semantic CFG rule.

            Every lexical item from ``cfg.generate_words()`` first receives its
            span from ``spans``.  Each rule from ``cfg.generate_rules()`` is then
            given the span running from its first child's start to its last
            child's end and handed to ``detect_func``.  When that returns
            ``None`` the rule is flagged ``has_semantics = False`` and skipped;
            otherwise the detected edges are replaced by one non-terminal edge
            labelled ``rule.tag`` and the graph used for the next rule is
            rebuilt without the internal nodes.

            Yields:
                ``(node_rename_map, hrg_rule)`` as returned by
                ``HRGRule.extract``, with ``hrg_rule.cfg`` set to
                ``cls.convert_cfg_node(rule)``.

            When ``draw`` is truthy, a picture of the graph before each
            reduction is appended to ``pics``, plus one picture of the final
            graph once the generator runs past its last rule.
            """
            words = list(cfg.generate_words())
            assert len(words) == len(spans)
            pending_rules = list(cfg.generate_rules())

            # Lexical items carry no span yet; rule spans below are derived
            # from the spans of their children.
            for word_span, word in zip(spans, words):
                word.span = word_span

            step = 1
            previous_edge = None

            for rule in pending_rules:
                covered_span = (rule.child[0].span[0], rule.child[-1].span[1])
                rule.span = covered_span

                detected = detect_func(hg, rule)
                rule.has_semantics = detected is not None
                if detected is None:
                    continue
                matched_edges, inner_nodes, outer_nodes = detected

                # The matched subgraph collapses into one non-terminal edge
                # attached to the external nodes.
                replacement_edge = HyperEdge(outer_nodes, rule.tag, False,
                                             covered_span)
                reduced_graph = HyperGraph(
                    hg.nodes - inner_nodes,
                    (hg.edges - matched_edges) | {replacement_edge})

                node_rename_map, hrg_rule = HRGRule.extract(
                    matched_edges, inner_nodes, outer_nodes, rule.tag)

                if draw:
                    # Draw the graph as it was *before* this reduction.
                    pic_path = "/tmp/a3/{}/{}".format(sent_id, step)
                    picture = cls.draw(hg,
                                       pic_path,
                                       matched_edges,
                                       inner_nodes,
                                       outer_nodes,
                                       previous_edge,
                                       draw_format=draw_format)
                    pics.append(picture)

                hg = reduced_graph
                previous_edge = replacement_edge
                step += 1
                hrg_rule.cfg = cls.convert_cfg_node(rule)
                yield node_rename_map, hrg_rule

            if draw:
                pic_path = "/tmp/a3/{}/{}".format(sent_id, step)
                picture = cls.draw(hg,
                                   pic_path,
                                   last_new_edge=previous_edge,
                                   draw_format=draw_format)
                pics.append(picture)
Esempio n. 2
0
 def transform_edge(self, edge, lexicon):
     """Fill the ``NEWLEMMA`` placeholder of ``edge.label`` using ``lexicon``.

     Edges whose label does not contain ``NEWLEMMA`` are returned unchanged.
     Otherwise the word is ``lexicon.string`` with ``_`` replaced by ``+``:

     * labels containing ``_u_unknown`` receive the word as-is;
     * other labels receive ``self.lemmatizer.lemmatize(...)`` of the word,
       passing the part-of-speech letter when it is ``n``, ``v`` or ``a``.

     Returns a new ``HyperEdge`` with the formatted label and the original
     nodes, terminal flag and span.
     """
     template = edge.label
     if "NEWLEMMA" not in template:
         return edge

     surface = lexicon.string.replace("_", "+")
     if "_u_unknown" in template:
         lemma = surface
     else:
         # The POS letter sits 10 characters after the start of "NEWLEMMA"
         # (presumably labels look like "_{NEWLEMMA}_n_1" -- TODO confirm).
         pos_tag = template[template.find("NEWLEMMA") + 10]
         lemmatize = self.lemmatizer.lemmatize
         lemma = (lemmatize(surface, pos_tag)
                  if pos_tag in ("n", "v", "a") else lemmatize(surface))

     filled_label = template.format(NEWLEMMA=lemma)
     return HyperEdge(edge.nodes, filled_label, edge.is_terminal, edge.span)
Esempio n. 3
0
        def generate_derivation(hg  # type: HyperGraph
                                ):
            """Yield one ``CFGRule`` per rule of ``cfg``, reducing ``hg`` as it goes.

            For every rule from ``cfg.generate_rules()``:

            * if ``detect_func`` returns ``None`` the rule is marked
              ``has_semantics = False`` and a ``CFGRule`` without an HRG part
              is yielded;
            * otherwise the detected edges are replaced in the graph by a single
              non-terminal edge labelled ``rule.tag``, an HRG rule is extracted
              with ``HRGRule.extract`` and a synchronous ``CFGRule`` pairing
              each right-hand-side symbol with its (renamed) hyperedge is
              yielded.  If a child's hyperedge label disagrees with the child's
              tag, a warning is printed to stderr and the yielded rule has
              ``None`` for both its right-hand side and its HRG part.

            When ``draw`` is truthy, pictures of each intermediate graph and of
            the final graph are appended to ``pics``.
            """
            # NOTE(review): child.has_semantics is read below while handling the
            # parent, so this presumably relies on children being generated
            # before their parents ("root last") -- confirm against
            # cfg.generate_rules().
            rules = list(cfg.generate_rules())  # root last

            count = 1  # index used in the picture file name
            last_new_edge = None  # non-terminal edge created by the previous reduction

            for rule in rules:
                # A rule spans from the start of its first child to the end of
                # its last child.
                new_span = (rule.child[0].span[0], rule.child[-1].span[1])
                rule.span = new_span

                result = detect_func(hg, rule)

                # null semantic node
                if result is None:
                    rule.has_semantics = False
                    if lexicalize_null_semantic:
                        # Right-hand side is the sequence of words generated by
                        # this rule.
                        cfg_rhs = tuple((j, None) for j in rule.generate_words(
                        ))  # type: Tuple[Tuple[Lexicon, None]]
                    else:
                        # Right-hand side is the direct children: lexicons
                        # as-is, subtrees by their tag.
                        cfg_rhs = tuple(
                            (i if isinstance(i, Lexicon) else i.tag, None)
                            for i in rule.child)
                    yield CFGRule(rule.tag, cfg_rhs, None)
                    continue
                else:
                    rule.has_semantics = True
                    all_edges, internal_nodes, external_nodes = result

                # Collapse the detected subgraph into one non-terminal edge
                # attached to the external nodes.
                new_edge = HyperEdge(external_nodes, rule.tag, False, new_span)

                new_nodes = hg.nodes - internal_nodes
                new_edges = (hg.edges - all_edges) | {new_edge}

                hg_new = HyperGraph(new_nodes, new_edges)
                node_rename_map, hrg_rule = HRGRule.extract(
                    all_edges, internal_nodes, external_nodes, rule.tag, rule)

                if draw:
                    # Draw the graph as it was before this reduction.
                    pic_path = "/tmp/a3/{}/{}".format(sent_id, count)
                    pics.append(
                        HRGDerivation.draw(hg,
                                           pic_path,
                                           all_edges,
                                           internal_nodes,
                                           external_nodes,
                                           last_new_edge,
                                           draw_format=draw_format))

                hg = hg_new
                last_new_edge = new_edge
                count += 1

                if isinstance(rule.child[0], Lexicon):
                    # leaf node
                    assert len(rule.child) == 1
                    cfg_rhs = ((rule.child[0], None), )
                else:
                    # internal node
                    assert all(isinstance(i, ConstTree) for i in rule.child)
                    cfg_rhs = []
                    for i in rule.child:
                        if not i.has_semantics:
                            # Child without semantics: no hyperedge to pair it
                            # with, so emit either its words or its tag.
                            if lexicalize_null_semantic:
                                cfg_rhs.extend(
                                    (j, None) for j in i.generate_words())
                            else:
                                cfg_rhs.append((i.tag, None))
                        else:
                            # find corresponding hyperedge in hrg rule for this tree node
                            target_edges = [
                                j for j in all_edges if j.span == i.span
                            ]
                            assert len(target_edges) == 1
                            if target_edges[0].label != i.tag:
                                print("Non-consistent CFG and HRG: ",
                                      " ".join(j.string
                                               for j in rule.generate_words()),
                                      file=sys.stderr)
                                # None marks the whole right-hand side as
                                # unusable; handled after the loop.
                                cfg_rhs = None
                                break
                            # Re-express the edge with the node names used
                            # inside the extracted HRG rule.
                            target_edges_r = HyperEdge(
                                (node_rename_map[node]
                                 for node in target_edges[0].nodes),
                                target_edges[0].label,
                                target_edges[0].is_terminal)
                            cfg_rhs.append((i.tag, target_edges_r))

                if cfg_rhs is not None:
                    yield CFGRule(rule.tag, tuple(cfg_rhs), hrg_rule)
                else:
                    # Inconsistent CFG/HRG pair: yield a rule with neither a
                    # right-hand side nor an HRG part.
                    yield CFGRule(rule.tag, cfg_rhs, None)

            if draw:
                # Final picture: the fully reduced graph.
                pic_path = "/tmp/a3/{}/{}".format(sent_id, count)
                pics.append(
                    HRGDerivation.draw(hg,
                                       pic_path,
                                       last_new_edge=last_new_edge,
                                       draw_format=draw_format))
Esempio n. 4
0
    def extract(
            cls,
            edges,  # type: Set[HyperEdge]
            internal_nodes,  # type: Set[GraphNode]
            external_nodes,  # type: Set[GraphNode]
            label,  # type: str
            cfg_rule=None):
        """Build a rule from a subgraph, renaming its nodes by structural hash.

        Each node starts with the same hash; for ten rounds every node's hash
        is recomputed from the labels of its incident edges, its position in
        them, the current hashes of the nodes on those edges and whether the
        node is external.  Nodes are then numbered ``"0"``, ``"1"``, ... in
        ascending order of their final hash.

        Args:
            edges: edges of the subgraph.
            internal_nodes: nodes that only occur inside the subgraph.
            external_nodes: nodes shared with the rest of the graph.
            label: label of the left-hand-side edge.
            cfg_rule: optional tree node; when it has exactly two children
                their spans may decide the order of two external nodes.

        Returns:
            ``(node_rename_map, rule)`` where ``node_rename_map`` maps each
            original node to its renamed ``GraphNode`` and ``rule`` is
            ``cls(lhs, rhs)``.
        """
        all_nodes = internal_nodes.union(external_nodes)

        # node -> [(edge, position of the node inside that edge), ...]
        incidence = defaultdict(list)
        for edge in edges:
            for position, node in enumerate(edge.nodes):
                incidence[node].append((edge, position))

        def edge_digest(
                hashes,  # type: Dict[GraphNode, bytes]
                edge,  # type: HyperEdge
                position  # type: int
        ):
            """Hash one incident edge as seen from the node at ``position``."""
            md5_obj = hashlib.md5(
                (edge.label + "#" + str(position)).encode())
            md5_obj.update(
                b"".join(hashes[adj_node] + b"#" for adj_node in edge.nodes))
            return md5_obj.digest()

        def refined_digest(
                hashes,  # type: Dict[GraphNode, bytes]
                node  # type: GraphNode
        ):
            """Next-round hash of ``node`` given the current ``hashes``."""
            # Sorting makes the result independent of edge iteration order.
            neighbourhood = hashlib.md5(b"".join(sorted(
                edge_digest(hashes, edge, position)
                for edge, position in incidence[node]))).digest()
            marker = b'\x01' if node in external_nodes else b'\x00'
            return hashlib.md5(neighbourhood + marker).digest()

        seed = hashlib.md5(b"13").digest()
        node_hashes = dict.fromkeys(all_nodes, seed)  # node -> hash
        for _ in range(10):
            node_hashes = {
                node: refined_digest(node_hashes, node) for node in all_nodes
            }

        # Ascending hash order defines the new node names.
        canonical_order = sorted(node_hashes, key=node_hashes.__getitem__)
        node_rename_map = {
            node: GraphNode(str(rank))
            for rank, node in enumerate(canonical_order)
        }

        # Right-hand side: the same edges over the renamed nodes.
        renamed_edges = [
            HyperEdge((node_rename_map[node] for node in edge.nodes),
                      edge.label, edge.is_terminal)
            for edge in edges
        ]
        rhs = HyperGraph(frozenset(node_rename_map.values()),
                         frozenset(renamed_edges))

        def unary_edge_nodes(span):
            """Nodes of single-node edges whose span equals ``span``."""
            return [
                edge.nodes[0] for edge in edges
                if len(edge.nodes) == 1 and edge.span == span
            ]

        def ordered_external_nodes():
            """Choose the order of external nodes on the left-hand side."""
            if len(external_nodes) == 2:
                # Prefer an order that matches an existing edge exactly.
                for candidate in permutations(external_nodes):
                    if any(edge.nodes == candidate for edge in edges):
                        return [node_rename_map[node] for node in candidate]
                # Otherwise follow the left/right children of a binary rule.
                if cfg_rule is not None and len(cfg_rule.child) == 2:
                    left_span = cfg_rule.child[0].span
                    right_span = cfg_rule.child[1].span
                    left_nodes = unary_edge_nodes(left_span)
                    right_nodes = unary_edge_nodes(right_span)
                    if (left_nodes and right_nodes and
                            {left_nodes[0], right_nodes[0]} == external_nodes):
                        return [
                            node_rename_map[left_nodes[0]],
                            node_rename_map[right_nodes[0]]
                        ]
            # Default: ascending order of the new numeric names.
            return sorted((node_rename_map[node] for node in external_nodes),
                          key=lambda renamed: int(renamed.name))

        lhs = HyperEdge(ordered_external_nodes(),
                        label=label,
                        is_terminal=False)
        return node_rename_map, cls(lhs, rhs)
Esempio n. 5
0
 def transform_edge(mapping, edge, span):
     """Return a new ``HyperEdge`` like ``edge`` but over mapped nodes and ``span``.

     Every node of ``edge`` is looked up in ``mapping`` (a missing node raises
     ``KeyError``); label and terminal flag are copied from ``edge``.
     """
     mapped_nodes = (mapping[node] for node in edge.nodes)
     return HyperEdge(mapped_nodes, edge.label, edge.is_terminal, span)
Esempio n. 6
0
 def transform_edge(mapping, edge):
     """Turn a rule edge into an edge of the concrete graph.

     Every node of ``edge`` is looked up in ``mapping`` (a missing node raises
     ``KeyError``); label and terminal flag are copied and the span is ``None``.
     """
     concrete_nodes = (mapping[node] for node in edge.nodes)
     return HyperEdge(concrete_nodes, edge.label, edge.is_terminal, None)
Esempio n. 7
0
 def transform_edge_2(mapping, edge):
     """Turn a rule edge into an edge of the concrete graph, keeping its span.

     Nodes are replaced by ``mapping.get(node)`` when that value is truthy and
     are otherwise kept as they are; label, terminal flag and span are copied
     from ``edge``.
     """
     concrete_nodes = ((mapping.get(node) or node) for node in edge.nodes)
     return HyperEdge(concrete_nodes, edge.label, edge.is_terminal, edge.span)
Esempio n. 8
0
    def sync_grammar_fallback_2(self, tree_node):
        """Build a fallback synchronous rule for a lexical ``tree_node``.

        ``tree_node.tag`` has the form ``"<rule_name>#<node count>"``.

        * Node count other than 1: if ``self.terminal_mapping`` has a
          non-empty entry for the full tag, its most common item is returned.
          Otherwise a rule is built in which one extra node is linked to each
          external node by a terminal edge labelled ``"???"``.
        * Node count 1: a single-node rule is built whose terminal edge label
          is chosen from the surface word and the rule name (``card``,
          ``named`` or a lemma-based label).  If no name pattern applies,
          a non-empty ``self.lexicon_mapping`` entry is returned directly.

        Returns:
            A ``Counter`` holding the single chosen rule, or the
            ``lexicon_mapping`` entry itself in the case described above.
        """
        rule_name, count_text = tree_node.tag.rsplit("#", 1)
        lexical_child = tree_node.children[0]
        word = lexical_child.string
        main_node_count = int(count_text)

        if main_node_count != 1:
            known_rules = self.terminal_mapping.get(tree_node.tag)
            if known_rules:
                return Counter([known_rules.most_common(1)[0][0]])

            attached_nodes = [
                GraphNode(str(i)) for i in range(main_node_count)
            ]
            hub_node = GraphNode(str(main_node_count + 1))
            lhs_edge = HyperEdge(
                nodes=attached_nodes,
                label=rule_name,
                is_terminal=False
            )
            # One placeholder edge from the hub to every external node.
            spoke_edges = [
                HyperEdge(nodes=[hub_node, node],
                          label="???",
                          is_terminal=True)
                for node in attached_nodes
            ]
            rhs_graph = HyperGraph(
                nodes=frozenset(attached_nodes + [hub_node]),
                edges=frozenset(spoke_edges)
            )
        else:
            main_node = GraphNode("0")

            if self.pattern_number.match(word):
                label = "card"
            elif "generic_proper" in rule_name:
                label = "named"
            else:
                lemma = self.lemmatizer.lemmatize(word)
                if "n_-_c-pl-unk_le" in rule_name:
                    label = "_{}/nns_u_unknown".format(lemma)
                elif "n_-_mc_le" in rule_name or "n_-_c_le" in rule_name:
                    label = "_{}_n_1".format(lemma)
                elif "generic_mass_count_noun" in rule_name:
                    label = "_{}/nn_u_unknown".format(lemma)
                else:
                    candidates = self.lexicon_mapping[HLexicon(word),
                                                      main_node_count]
                    if candidates:
                        return candidates
                    label = "named"

            lhs_edge = HyperEdge(
                nodes=[main_node],
                label=rule_name,
                is_terminal=False
            )
            terminal_edge = HyperEdge(
                nodes=[main_node],
                label=label,
                is_terminal=True
            )
            rhs_graph = HyperGraph(
                nodes=frozenset([main_node]),
                edges=frozenset({terminal_edge})
            )

        fallback = CFGRule(lhs=rule_name,
                           rhs=((lexical_child, None),),
                           hrg=HRGRule(lhs=lhs_edge, rhs=rhs_graph))
        return Counter([fallback])