Esempio n. 1
0
def delete_nodes(graph, nodes):
    """
    Build a new Hgraph from ``graph`` with the given nodes removed.

    An edge is copied unchanged when its parent is not in ``nodes`` and it is
    not a single-child edge whose only child is in ``nodes``. Every other edge
    is dropped, but its surviving endpoints (the parent, and the child of a
    single-child edge) are kept as nodes with an empty ``ListMap`` unless they
    are already present. Roots are recomputed on the result.
    """
    pruned = Hgraph()
    for parent, relation, children in graph.triples():
        parent_removed = parent in nodes
        has_single_child = len(children) == 1
        single_child_removed = has_single_child and children[0] in nodes

        if not parent_removed and not single_child_removed:
            pruned._add_triple(parent, relation, children, warn=False)
            continue

        # The edge itself is dropped; keep whichever endpoints survive.
        if not parent_removed and parent not in pruned:
            pruned[parent] = ListMap()
        if has_single_child and not single_child_removed \
                and children[0] not in pruned:
            pruned[children[0]] = ListMap()
    pruned.roots = pruned.find_roots(warn=False)
    return pruned
Esempio n. 2
0
    def to_hgraph(self):
        """
        Convert this hypergraph into an ``Hgraph``.

        Every node becomes an ``Hgraph`` node named ``"_" + node.name`` with an
        empty concept label; nodes whose ``is_root`` flag is set are appended
        to the roots. Every hyperedge becomes a triple whose parent is the
        first node of the edge and whose children are the remaining nodes.
        This object is attached to the result as ``my_hyper_graph``.

        No external nodes are assigned here (the previous implementation
        carried an external-node branch that could never execute).

        Returns:
            The newly built ``Hgraph``.
        """
        from common.hgraph.hgraph import Hgraph
        hgraph = Hgraph()
        hgraph.my_hyper_graph = self

        for node in self.nodes:  # type: GraphNode
            ident = "_" + node.name

            # The lookup is done only for its effect: it initializes the
            # dictionary entry for this node.
            hgraph[ident]
            hgraph.node_to_concepts[ident] = ""
            if node.is_root:
                hgraph.roots.append(ident)

        for edge in self.edges:  # type: HyperEdge
            hyperchild = tuple("_" + node.name for node in edge.nodes[1:])
            ident = "_" + edge.nodes[0].name
            hgraph._add_triple(ident, edge.label, hyperchild)

        return hgraph
Esempio n. 3
0
def get_line_graph(graph):
    """
    Build the line graph of ``graph`` as a new Hgraph.

    Each edge of ``graph`` becomes a node named by the string form of its
    ``(parent, relation, children)`` triple. Any two edges that touch the same
    node of ``graph`` are connected in both directions by an edge labelled
    with that shared node. Roots are recomputed on the result.
    """
    line_graph = Hgraph()
    incident_edges = defaultdict(list)

    # Index every edge under its parent and under each of its children.
    for parent, relation, children in graph.triples():
        edge_name = str((parent, relation, children))
        incident_edges[parent].append(edge_name)
        for child in children:
            incident_edges[child].append(edge_name)

    for shared_node in incident_edges:
        pairs = itertools.combinations(incident_edges[shared_node], 2)
        for first, second in pairs:
            line_graph._add_triple(first, shared_node, (second,), warn=False)
            line_graph._add_triple(second, shared_node, (first,), warn=False)
    line_graph.roots = line_graph.find_roots()
    return line_graph
Esempio n. 4
0
    def parse_bitexts(self, pair_iterator):
        """
        Parse every pair of input objects produced by ``pair_iterator``.

        A side is read with ``Hgraph.from_string`` when the grammar declares
        the corresponding RHS type as "hypergraph"; otherwise the line is
        stripped and split on whitespace into tokens. This is a generator: it
        yields one ``cky_chart`` per input pair.
        """
        def read_side(rhs_type, line):
            # Turn one raw input line into a graph or a token list.
            if rhs_type == "hypergraph":
                return Hgraph.from_string(line)
            return line.strip().split()

        for first_line, second_line in pair_iterator:
            first_obj = read_side(self.grammar.rhs1_type, first_line)
            second_obj = read_side(self.grammar.rhs2_type, second_line)
            yield cky_chart(self.parse_bitext(first_obj, second_obj))
Esempio n. 5
0
 def combiner(item, childobjs):
     """
     Substitute child graphs for the nonterminal edges of a leaf graph.

     The graph for ``item`` is obtained from ``leaf``. For every
     ``(nt, cgraph)`` entry of ``childobjs`` the nonterminal edge identified
     by ``nt`` is looked up, wrapped in a one-edge fragment and replaced by
     ``cgraph``.

     Raises:
         DerivationException: if ``replace_fragment`` fails with an
             ``AssertionError`` for one of the nonterminals.
     """
     graph = leaf(item)
     for nt, cgraph in childobjs.items():
         p, r, c = graph.find_nt_edge(*nt)
         fragment = Hgraph.from_triples([(p, r, c)], graph.node_to_concepts)
         try:
             graph = graph.replace_fragment(fragment, cgraph)
         except AssertionError:
             # Call-style raise: valid on both Python 2 and Python 3.
             raise DerivationException(
                 "Incompatible hyperedge type for nonterminal %s." % str(nt[0]))
     # NOTE(review): no return statement is visible in this block, so the
     # combined graph is discarded -- confirm whether a return is missing.
Esempio n. 6
0
    def parse_bitexts(self, pair_iterator):
        """
        Generator that parses each pair of inputs from ``pair_iterator``.

        For each side, the grammar's ``rhs1_type`` / ``rhs2_type`` decides how
        the raw line is read: "hypergraph" means ``Hgraph.from_string``, any
        other value means the stripped line split on whitespace. The raw chart
        returned by ``parse_bitext`` is wrapped in ``cky_chart`` and yielded.
        """
        for left_line, right_line in pair_iterator:
            left_is_graph = self.grammar.rhs1_type == "hypergraph"
            left = (Hgraph.from_string(left_line) if left_is_graph
                    else left_line.strip().split())

            right_is_graph = self.grammar.rhs2_type == "hypergraph"
            right = (Hgraph.from_string(right_line) if right_is_graph
                     else right_line.strip().split())

            yield cky_chart(self.parse_bitext(left, right))
Esempio n. 7
0
 def combiner(item, childobjs):
     """
     Replace each nonterminal edge of the leaf graph with its child graph.

     Starts from ``leaf(item)``; for every ``(nt, cgraph)`` pair in
     ``childobjs`` it finds the nonterminal edge named by ``nt``, builds a
     single-edge fragment for it and swaps that fragment for ``cgraph``.

     Raises:
         DerivationException: when ``replace_fragment`` raises an
             ``AssertionError`` for a nonterminal.
     """
     graph = leaf(item)
     for nt, cgraph in childobjs.items():
         edge = tuple(graph.find_nt_edge(*nt))
         p, r, c = edge  # still enforces that the edge is a triple
         fragment = Hgraph.from_triples([(p, r, c)], graph.node_to_concepts)
         try:
             graph = graph.replace_fragment(fragment, cgraph)
         except AssertionError:
             # Call-style raise works on Python 2 and Python 3 alike.
             message = "Incompatible hyperedge type for nonterminal %s." % str(
                 nt[0])
             raise DerivationException(message)
     # NOTE(review): the block ends without returning ``graph`` -- confirm
     # whether the original function had a trailing return.
Esempio n. 8
0
def get_binarized_partitions(graph, edgesets):
    """
    Generate the ways of splitting ``graph`` among the given edge sets.

    With a single edge set the only result is one part holding every triple of
    ``graph``. Otherwise ``get_partitions`` splits the graph between the first
    edge set and the edges of all remaining sets, and the right-hand part is
    partitioned recursively among the remaining sets. Each yielded value is a
    list with one edge collection per edge set.
    """
    if len(edgesets) == 1:
        yield [graph.triples()]
        return

    head_edges = edgesets[0]
    remaining_sets = edgesets[1:]
    remaining_edges = [edge for edgeset in remaining_sets for edge in edgeset]

    for left_edges, right_edges in get_partitions(graph, head_edges,
                                                  remaining_edges):
        right_graph = Hgraph.from_triples(right_edges, {}, warn=False)
        # The recursive generator is exhausted before anything is yielded.
        tails = list(get_binarized_partitions(right_graph, remaining_sets))
        for tail in tails:
            yield [left_edges] + tail
Esempio n. 9
0
        def compute_chart(tree, graph, prefix=""):
            """
            Recursively compute the chart.

            ``graph`` is the sub-graph under consideration and ``tree`` is the
            tree of spans. For a constituent that is not yet in ``chart``,
            every binarized partitioning of the graph among the children is
            explored recursively and the resulting lists of child partitions
            are stored in ``chart`` under this constituent's partition.

            Args:
                tree: span tree for the current constituent.
                graph: sub-graph assigned to this constituent.
                prefix: passed down with one extra space per recursion level;
                    not otherwise used in this function.

            Returns:
                The ``Partition`` describing this constituent and edge set.

            Raises:
                ChartTooBigException: when ``chart`` grows beyond
                    ``MAX_CHART_SIZE`` entries.
            """
            count[0] += 1

            triples = set(graph.triples())
            edge_vector = tuple([1 if x in triples else 0 for x in graph_edge_list])

            leaves = tree.leaves()
            if len(tree) == 1 and not isinstance(tree[0], fancy_tree.FancyTree):
                return Partition(tree.node, leaves[0], leaves[-1], edge_vector)

            # First get the set of aligned edges for this constituent and its
            # children.
            # NOTE(review): the value is never read afterwards; the statement
            # is kept because the rev_alignments lookups may matter (missing
            # keys) -- confirm before removing.
            aligned_edges_for_span = set([edge for token in tree.leaves() for edge in rev_alignments[token]])

            partition_object = Partition(tree.node, leaves[0], leaves[-1], edge_vector)
            if partition_object not in chart:
                try:
                    possibilities = []
                    child_edgesets = []
                    # Compute edge set for each child
                    for t in tree:
                        edgeset = []
                        for l in t.leaves():
                            edgeset.extend(rev_alignments[l])
                        child_edgesets.append(edgeset)

                    # For each possible partitioning
                    for cparts in get_binarized_partitions(graph, child_edgesets):
                        child_forests = []
                        for i in range(len(tree)):
                            childgraph = Hgraph.from_triples(cparts[i], {}, warn=False)
                            sub_forest = compute_chart(tree[i], childgraph, prefix=prefix + " ")
                            if len(chart) > MAX_CHART_SIZE:
                                # NOTE(review): the message hard-codes 5000
                                # while the limit is MAX_CHART_SIZE.
                                raise ChartTooBigException(
                                    "Chart size exceeded 5000 entries. dropping this sentence.")
                            child_forests.append(sub_forest)
                        possibilities.append(child_forests)

                    chart[partition_object] = possibilities
                except IncompatibleAlignmentException:
                    # Remember which constituent could not be aligned; the
                    # partition is still returned to the caller.
                    chart.inconsistent_alignment = (tree.node, leaves[0], leaves[-1])
            return partition_object
Esempio n. 10
0
def test():
    """
    Set up a sample parse tree and an aligned graph for manual testing.

    Builds a ``FancyTree`` from a bracketed parse, reads a graph with
    ``Hgraph.from_string``, attaches node-to-token alignments and converts the
    graph with ``to_instance_edges``.
    """
    bracketing = """
    (S
        (NP (DT The) (NN boy))
        (VP (VBZ wants)
          (NP (DT the) (NN girl)
            (S
              (VP (TO to)
                (VP (VB believe)
                  (NP (PRP him)))))))
        (. .))"""
    graph_text = "(w.want :arg0 b.boy :arg1 (b2.believe :arg0 (g.girl) :arg1 b.))"

    tree = FancyTree(bracketing)
    graph = Hgraph.from_string(graph_text)
    # Node name -> indices of the aligned tokens.
    graph.node_alignments = {"b": [1], "w": [2], "g": [4], "b2": [6]}
    graph = graph.to_instance_edges()
Esempio n. 11
0
def test():
    """
    Prepare the example tree/graph pair used for manual testing.

    The bracketed parse is loaded into a ``FancyTree``; the graph string is
    parsed with ``Hgraph.from_string``, given node alignments and then passed
    through ``to_instance_edges``.
    """
    parse_text = """
    (S
        (NP (DT The) (NN boy))
        (VP (VBZ wants)
          (NP (DT the) (NN girl)
            (S
              (VP (TO to)
                (VP (VB believe)
                  (NP (PRP him)))))))
        (. .))"""
    tree = FancyTree(parse_text)

    # Node name -> indices of the aligned tokens.
    alignments = {"b": [1], "w": [2], "g": [4], "b2": [6]}
    graph = Hgraph.from_string(
        "(w.want :arg0 b.boy :arg1 (b2.believe :arg0 (g.girl) :arg1 b.))")
    graph.node_alignments = alignments
    graph = graph.to_instance_edges()
Esempio n. 12
0
def get_binarized_partitions(graph, edgesets):
    """
    Yield every assignment of the edges of ``graph`` to ``edgesets``.

    Base case: with exactly one edge set, the single result is a one-element
    list containing all triples of ``graph``. Recursive case: the graph is
    split by ``get_partitions`` into a left part (for the first edge set) and
    a right part (for the rest), and the right part is partitioned again among
    the remaining edge sets.
    """
    if len(edgesets) == 1:
        yield [graph.triples()]
        return

    first_set, rest = edgesets[0], edgesets[1:]
    rest_edges = []
    for edgeset in rest:
        for edge in edgeset:
            rest_edges.append(edge)

    for left_edges, right_edges in get_partitions(graph, first_set,
                                                  rest_edges):
        subgraph = Hgraph.from_triples(right_edges, {}, warn=False)
        # Collect all recursive results first, then yield them one by one.
        sub_results = list(get_binarized_partitions(subgraph, rest))
        for partitions in sub_results:
            yield [left_edges] + partitions
Esempio n. 13
0
def delete_nodes(graph, nodes):
    """
    Return a new Hgraph that omits the nodes listed in ``nodes``.

    Edges are copied as they are unless their parent is in ``nodes`` or they
    have exactly one child and that child is in ``nodes``. For such dropped
    edges, a parent or single child that is not in ``nodes`` is still
    registered in the result with an empty ``ListMap`` (if not yet present).
    The roots of the result are recomputed at the end.
    """
    def survives(node):
        # True when the node is not scheduled for deletion.
        return node not in nodes

    result = Hgraph()
    for head, label, tail in graph.triples():
        single = len(tail) == 1
        if survives(head) and not (single and tail[0] in nodes):
            result._add_triple(head, label, tail, warn=False)
        else:
            if survives(head) and head not in result:
                result[head] = ListMap()
            if single and survives(tail[0]) and tail[0] not in result:
                result[tail[0]] = ListMap()
    result.roots = result.find_roots(warn=False)
    return result
Esempio n. 14
0
  def merge_string_nonterminals(self, string, amr, next_id):
    """
    Binarize a string-graph pair that consists only of nonterminals, keeping
    a visit order suitable for parsing.

    Tokens are processed left to right with a shift-reduce loop: the current
    token is merged with the top of the stack into a new rule when their
    edges are connected (the parent of one is among the children of the
    other); otherwise both are shifted onto the stack.

    Returns the updated ``(string, amr, rules, next_id)``. Raises
    ``BinarizationException`` if more than one item is left on the stack.
    """
    def edges_labelled(graph, label):
      # All triples of ``graph`` whose label prints as ``label``.
      return [edge for edge in graph.triples() if str(edge[1]) == label]

    rules = []
    stack = []
    # Empty tokens are dropped; the list is reversed so pop() reads in order.
    pending = [tok for tok in string if tok]
    pending.reverse()
    # standard shift-reduce binarization algorithm
    # TODO add citation after paper is published
    while pending:
      current = pending.pop()
      current_matches = edges_labelled(amr, current)
      assert len(current_matches) == 1
      current_edge = current_matches[0]
      if not stack:
        stack.append(current)
        continue
      top = stack.pop()
      top_edge = edges_labelled(amr, top)[0]

      connected = (top_edge[0] in current_edge[2]) or \
          (current_edge[0] in top_edge[2])
      if not connected:
        # can't merge, so shift
        stack.append(top)
        stack.append(current)
        continue

      # can merge, so reduce
      rule_amr = Hgraph.from_triples([top_edge, current_edge])
      assert len(rule_amr.roots) == 1

      rule_string = [top, current]
      fictitious_tree = self.make_fictitious_tree(string, rule_string)
      new_rule, tree, amr, next_id = self.make_rule(
          fictitious_tree, amr, Tree('X', rule_string), rule_amr, next_id)
      string = tree.leaves()
      # The new rule's symbol becomes the next token to process.
      pending.append('#%s' % new_rule.symbol)
      rules.append(new_rule)

    if len(stack) > 1:
      raise BinarizationException

    return string, amr, rules, next_id
Esempio n. 15
0
def replace_instance_edges(graph, tree):
    """
    Relabel the aligned edges of ``graph`` with the tree leaves they align to.

    Every triple found in ``graph.edge_alignments`` has its label replaced by
    the aligned leaf of ``tree`` followed by an apostrophe; other triples are
    copied unchanged. The new Hgraph records alignments for the relabelled
    edges only and shares ``node_alignments``, ``roots`` and
    ``external_nodes`` with ``graph``.
    """
    leaves = tree.leaves()
    new_triples = []
    new_alignments = {}
    for edge in graph.triples():
        if edge not in graph.edge_alignments:
            new_triples.append(edge)
            continue

        parent, _, children = edge
        # TODO: not sure what to do with multiple tokens; only the first
        # aligned token is used.
        token = graph.edge_alignments[edge][0]
        relabelled = (parent, "%s'" % leaves[token], children)
        new_triples.append(relabelled)
        new_alignments[relabelled] = [token]

    result = Hgraph.from_triples(new_triples, {}, warn=False)
    result.edge_alignments = new_alignments
    result.node_alignments = graph.node_alignments
    result.roots = graph.roots
    result.external_nodes = graph.external_nodes
    return result
Esempio n. 16
0
  def collapse_amr_terminals(self, tree, amr, next_id):
    """
    Create new rules by merging terminal subgraphs with their closest
    nonterminal edge.

    Nonterminal edges are visited in the order ``amr.triples()`` returns them.
    Each one is combined with the terminal edges found by
    ``terminal_search``; the first one additionally takes the first root edge
    and the terminals found from it. A rule is made for every nonterminal
    that has something attached.

    Returns the updated ``(tree, amr, rules, next_id)``.
    """
    # triples returns in breadth-first order, so the first nonterminal edge
    # in this list is the one closest to the root of the AMR
    nonterminal_edges = [edge for edge in amr.triples()
                         if isinstance(edge[1], NonterminalLabel)]
    rules = []
    for position, nt_edge in enumerate(nonterminal_edges):
      # in general, all terminal edges reachable from the tail nodes of a
      # nonterminal edge are attached to it
      attached = self.terminal_search(nt_edge, amr.triples())
      if position == 0:
        # terminal edges above every nonterminal edge are handled here: the
        # first nonterminal edge is reachable from the root without passing
        # through another nonterminal, so the high terminals (those
        # reachable from the root) are attached to it
        attached |= self.terminal_search(amr.root_edges()[0], amr.triples())
        attached |= {amr.root_edges()[0]}
      if not attached:
        # nothing to collapse, so no rule is needed
        continue

      rule_amr = Hgraph.from_triples({nt_edge} | attached)
      rule_tree = str(nt_edge[1])

      assert len(rule_amr.roots) == 1

      new_rule, tree, amr, next_id = self.make_rule(
          tree, amr, rule_tree, rule_amr, next_id)
      rules.append(new_rule)

    return tree, amr, rules, next_id
Esempio n. 17
0
def replace_instance_edges(graph, tree):
    """
    Return a copy of ``graph`` whose aligned edges carry token labels.

    For each triple listed in ``graph.edge_alignments`` the label becomes the
    tree leaf at the first aligned token index, with an apostrophe appended.
    Unaligned triples are kept as they are. Edge alignments of the result
    cover only the relabelled edges; node alignments, roots and external
    nodes are taken over from ``graph``.
    """
    def relabel(edge):
        # Returns (triple to store, aligned token or None).
        if edge not in graph.edge_alignments:
            return edge, None
        parent, _, children = edge
        # TODO: not sure what to do with multiple tokens
        token = graph.edge_alignments[edge][0]
        return (parent, "%s'" % tree_leaves[token], children), token

    tree_leaves = tree.leaves()
    triples = []
    alignments = {}
    for original in graph.triples():
        stored, token = relabel(original)
        triples.append(stored)
        if original in graph.edge_alignments:
            alignments[stored] = [token]

    res = Hgraph.from_triples(triples, {}, warn=False)
    res.edge_alignments = alignments
    res.node_alignments = graph.node_alignments
    res.roots = graph.roots
    res.external_nodes = graph.external_nodes
    return res
Esempio n. 18
0
    def rhs_to_hgraph(self):
        """
        Convert the right-hand side of this rule into an ``Hgraph``.

        Each RHS node becomes a node named ``"_" + node.name`` with an empty
        concept label. A node that also occurs in ``self.lhs.nodes`` is an
        external node whose id is its position there; the node at position 0
        is added to the roots. Edge labels that contain no underscore and do
        not start with "ARG" or "BV" are wrapped in ``NonterminalLabel``;
        those without an index receive ``"_<n>"`` from a running counter.

        Raises:
            Exception: if a node would receive two different external ids.
        """
        from common.cfg import NonterminalLabel
        from common.hgraph.hgraph import Hgraph
        result = Hgraph()

        for node in self.rhs.nodes:  # type: GraphNode
            try:
                external_id = self.lhs.nodes.index(node)
            except ValueError:
                external_id = None
            name = "_" + node.name

            # The lookup initializes the dictionary entry for this node.
            result[name]
            result.node_to_concepts[name] = ""
            if external_id is not None:
                already_external = name in result.external_nodes
                if already_external and result.external_nodes[name] != external_id:
                    raise Exception(
                        "Incompatible external node IDs for node %s." % name)
                result.external_nodes[name] = external_id
                result.rev_external_nodes[external_id] = name
            if external_id == 0:
                result.roots.append(name)

        next_nt_index = 0
        for edge in self.rhs.edges:  # type: HyperEdge
            children = tuple("_" + node.name for node in edge.nodes[1:])
            parent = "_" + edge.nodes[0].name
            is_terminal = ("_" in edge.label
                           or edge.label.startswith("ARG")
                           or edge.label.startswith("BV"))
            if is_terminal:
                triple_label = edge.label
            else:
                # this is a nonterminal edge
                triple_label = NonterminalLabel(edge.label)
                if not triple_label.index:
                    triple_label.index = "_%i" % next_nt_index
                    next_nt_index += 1

            result._add_triple(parent, triple_label, children)

        return result
Esempio n. 19
0
def get_line_graph(graph):
    """
    Return the line graph of ``graph``.

    Nodes of the result are the stringified ``(parent, relation, children)``
    triples of ``graph``. For every node of ``graph``, each pair of edges
    incident to it is linked by two opposite edges labelled with that node.
    """
    def link(target, source, via, destination):
        # Add one directed edge ``source -via-> destination`` to ``target``.
        target._add_triple(source, via, (destination, ), warn=False)

    lgraph = Hgraph()
    edges_at = defaultdict(list)

    for triple in graph.triples():
        parent, _, children = triple
        name = str(tuple(triple))
        edges_at[parent].append(name)
        for child in children:
            edges_at[child].append(name)

    for node in edges_at:
        for left, right in itertools.combinations(edges_at[node], 2):
            link(lgraph, left, node, right)
            link(lgraph, right, node, left)
    lgraph.roots = lgraph.find_roots()
    return lgraph
Esempio n. 20
0
  def merge_tree_symbols(self, tree, amr, next_id):
    """
    Binarizes a tree-graph pair according to the binarization dictated by the
    tree. WILL FAIL OFTEN IF TREE IS NOT BINARIZED.

    Repeatedly picks the tallest collapsible subtree, turns it and the graph
    edges of its nonterminal leaves into a new rule via ``make_rule``, and
    continues with the rewritten tree and graph until the tree is no longer a
    ``Tree`` instance.

    Returns:
        ``(tree, amr, rules, next_id)`` once the tree has been fully
        collapsed.

    Raises:
        BinarizationException: if no subtree can be collapsed, or if the
            graph induced by the chosen subtree does not have exactly one
            root.
    """
    rules = []
    # The only exit is the return below (or an exception).
    while True:
      if not isinstance(tree, Tree):
        assert len(amr.triples()) == 1
        return tree, amr, rules, next_id

      # Leaves starting with '#' are nonterminals. A subtree is collapsible
      # when it has exactly one nonterminal leaf (plus any terminals), or
      # when it consists of exactly two nonterminal leaves.
      collapsible_subtrees = []
      for st in tree.subtrees():
        leaves = st.leaves()
        nonterminal_count = len([t for t in leaves if t[0] == '#'])
        if nonterminal_count == 1:
          collapsible_subtrees.append(st)
        elif nonterminal_count == 2 and len(leaves) == 2:
          collapsible_subtrees.append(st)

      # if there are no subtrees to collapse, this rule isn't binarizable
      if not collapsible_subtrees:
        raise BinarizationException

      rule_tree = max(collapsible_subtrees, key=lambda x: x.height())
      nonterminals = [t for t in rule_tree.leaves() if t[0] == '#']
      rule_edge_l = [t for t in amr.triples() if str(t[1]) in nonterminals]
      rule_amr = Hgraph.from_triples(rule_edge_l)
      # if the induced graph is disconnected, this rule isn't binarizable
      if len(rule_amr.roots) != 1:
        raise BinarizationException

      new_rule, tree, amr, next_id = self.make_rule(tree, amr, rule_tree,
          rule_amr, next_id)
      rules.append(new_rule)
Esempio n. 21
0
  def collapse_string_terminals(self, string, amr, next_id):
    """
    Create new rules by merging terminal tokens with their closest
    nonterminal.

    Tokens starting with '#' are nonterminals. Each nonterminal is grouped
    with the tokens from the current start position up to the next
    nonterminal (or the end of the string), so terminals attach to the left,
    except those before the first nonterminal, which attach to the right.
    Groups of a single token produce no rule.

    Returns the updated ``(string, amr, rules, next_id)``.
    """
    nonterminal_tokens = [tok for tok in string if tok[0] == '#']
    rules = []
    # Starting at 0 attaches the leading terminals to the first nonterminal.
    start = 0

    for position, nt in enumerate(nonterminal_tokens):
      if position + 1 < len(nonterminal_tokens):
        end = string.index(nonterminal_tokens[position + 1])
      else:
        end = len(string)
      if end - start == 1:
        # there are no terminals to attach here, so skip ahead
        start = end
        continue

      rule_string = string[start:end]
      matching_edges = [e for e in amr.triples(nodelabels = self.nodelabels)
                        if str(e[1]) == nt]
      assert len(matching_edges) == 1
      rule_amr = Hgraph.from_triples(matching_edges)

      # hallucinate a tree with acceptable structure for make_rule
      fictitious_tree = self.make_fictitious_tree(string, rule_string)
      new_rule, tree, amr, next_id = self.make_rule(
          fictitious_tree, amr, Tree('X', rule_string), rule_amr, next_id)
      string = tree.leaves()
      rules.append(new_rule)

      # Continue right after the position where this group started.
      start += 1

    return string, amr, rules, next_id
Esempio n. 22
0
def main():
    """
    Render every distinct graph listed in a tab-separated file to a JPEG.

    ``sys.argv[1]`` names the input file; the first tab-separated field of
    each line is a graph description. ``sys.argv[2]`` is the output prefix:
    graph number ``i`` is written to ``<prefix>_<i>.jpg``. Duplicates are
    removed through a set, so the numbering order is unspecified.
    """
    # open() replaces the Python-2-only file() builtin, and the with block
    # closes the input file once it has been read.
    with open(sys.argv[1]) as graph_file:
        graphs = set(line.strip().split('\t')[0] for line in graph_file)
    for i, graph in enumerate(graphs):
        g = Hgraph.from_string(graph)
        g.render_to_file("{0}_{1}.jpg".format(sys.argv[2], i))

def tree_decomposition_edge(graph_edge, visited, amr, nodelabels=False):
    """
    Create the tree-decomposition node for one graph edge.

    The edge is marked as visited. The new ``TreeNode`` holds the head and
    the child nodes of the edge in ``graph_nodes``, the edge itself in
    ``graph_edge``, and the result of ``tree_decomposition_node`` over the
    child nodes as ``first_child``. With ``nodelabels`` set, the head and
    each child are read as pairs and only the first item of each is used.
    """
    visited.add(graph_edge)
    tree_node = TreeNode()

    parent_entry, child_entries = graph_edge[0], graph_edge[2]
    if not nodelabels:
        head, nodes = parent_entry, child_entries
    else:
        head = parent_entry[0]
        nodes = ()
        if child_entries:
            # Split the child pairs into their first and second items.
            nodes, _labels = zip(*child_entries)

    tree_node.graph_nodes.add(head)
    tree_node.graph_nodes |= set(nodes)
    tree_node.graph_edge = graph_edge

    first_child = tree_decomposition_node(nodes, visited, amr,
                                          nodelabels=nodelabels)
    tree_node.first_child = first_child
    return tree_node


if __name__ == "__main__":
    # Script entry point: parse one hard-coded sample graph and compute its
    # tree decomposition. The result is stored in ``td`` and not printed.
    from common.hgraph.hgraph import Hgraph
    graph = Hgraph.from_string("(n :P$1 :arg0 (a.n :E$2) :arg1 (n :S$3 a.))")
    td = tree_decomposition(graph)
Esempio n. 24
0
    subtrees.append(tree_node)

  return subtrees[0]

def tree_decomposition_edge(graph_edge, visited, amr, nodelabels = False):
  """
  Build the tree-decomposition node that corresponds to ``graph_edge``.

  Adds the edge to ``visited`` and returns a ``TreeNode`` whose
  ``graph_nodes`` contain the head and child nodes of the edge, whose
  ``graph_edge`` is the edge, and whose ``first_child`` comes from
  ``tree_decomposition_node`` applied to the child nodes. When
  ``nodelabels`` is true the head and children are pairs, of which only the
  first item is kept.
  """
  visited.add(graph_edge)
  tree_node = TreeNode()

  if nodelabels and graph_edge[2]:
    head = graph_edge[0][0]
    nodes, _labels = zip(*graph_edge[2])
  elif nodelabels:
    # Labelled edge without children.
    head = graph_edge[0][0]
    nodes = ()
  else:
    head = graph_edge[0]
    nodes = graph_edge[2]

  tree_node.graph_nodes.add(head)
  tree_node.graph_nodes |= set(nodes)
  tree_node.graph_edge = graph_edge
  tree_node.first_child = tree_decomposition_node(
      nodes, visited, amr, nodelabels = nodelabels)
  return tree_node


if __name__ == "__main__":
    # When run as a script, build the tree decomposition of one hard-coded
    # sample graph. The result is kept in ``td``; nothing is printed.
    from common.hgraph.hgraph import Hgraph
    graph = Hgraph.from_string("(n :P$1 :arg0 (a.n :E$2) :arg1 (n :S$3 a.))")
    td = tree_decomposition(graph)
Esempio n. 25
0
 def get_graph(self, graph):
     """
     Return the sub-graph of ``graph`` selected by this object's edge vector.

     The i-th triple of ``graph.triples()`` is kept when ``self.edges[i]``
     equals 1; the kept triples are turned into a new ``Hgraph``.
     """
     selected = [
         triple for position, triple in enumerate(graph.triples())
         if self.edges[position] == 1
     ]
     return Hgraph.from_triples(selected, {}, warn=False)
Esempio n. 26
0
        def convert_chart(partition, external_nodes, nt, first=False):
            """
            Convert the chart entry for ``partition`` into grammar rules.

            A partition that is not a key of ``chart`` is a leaf: one rule is
            created from its edges and its span of leaves. Otherwise one rule
            is created per possibility stored in ``chart[partition]``; each
            sub-constituent that has edges is converted recursively and
            collapsed into a nonterminal edge of the parent graph. Results are
            memoised in ``seen``, and ``result.use_counts`` counts how often
            each fragment is used.

            Args:
                partition: the partition to convert.
                external_nodes: external-node mapping assigned to the graphs
                    built for this partition.
                nt: nonterminal label for the rule's left-hand side; only its
                    ``label`` is used, the index is dropped.
                first: not used in this function.

            Returns:
                The fragment id under which the rules were stored in
                ``result``.
            """
            def node_rank(node):
                # Sort key equivalent to the former Python-2-only comparison
                # function ``lambda x, y: node_order[x] - node_order[y]``.
                return node_order[node]

            def edges_of(part):
                # Edges of the full graph whose bit is set in ``part.edges``.
                return [
                    graph_edge_list[i] for i in range(len(part.edges))
                    if part.edges[i] == 1
                ]

            def build_graph(edges):
                # Graph over ``edges`` with its roots sorted by node order.
                built = Hgraph.from_triples(edges, {}, warn=False)
                built.roots = built.find_roots()
                built.roots.sort(key=node_rank)
                return built

            def register(entry):
                # Store ``entry`` under a fresh fragment id and memoise it.
                fragment = fragment_counter[0]
                result[fragment] = entry
                result.use_counts[fragment] += 1
                seen[partition] = fragment
                fragment_counter[0] += 1
                return fragment

            nt = NonterminalLabel(nt.label)  # Get rid of the index

            if partition in seen:
                node = seen[partition]
                result.use_counts[node] += 1
                return node

            leaves = chart.tree.leaves()
            edges_in_partition = edges_of(partition)

            if partition not in chart:  # leaf
                graph = build_graph(edges_in_partition)
                graph.external_nodes = external_nodes
                str_rhs = [
                    leaves[i]
                    for i in range(partition.str_start, partition.str_end + 1)
                ]
                rule = Rule(0, nt.label, graph, tuple(str_rhs), 1)
                rule_id = self.add_rule(rule)
                return register([(rule_id, [])])

            poss = []
            for possibility in chart[partition]:
                # This is the parent graph
                partition_graph = build_graph(edges_in_partition)
                partition_graph.external_nodes = external_nodes
                children = []
                spans_to_nt = {}

                # These are the different sub-constituents. The nonterminal
                # index counts every sub-constituent, including those without
                # edges, starting at 1.
                for index, subpartition in enumerate(possibility, 1):
                    edges_in_subpartition = edges_of(subpartition)
                    if not edges_in_subpartition:
                        # Some constituents do not have any edges aligned to
                        # them; they contribute nothing to the rule.
                        continue

                    sub_graph = build_graph(edges_in_subpartition)
                    external_node_list = partition_graph.find_external_nodes2(
                        sub_graph)
                    external_node_list.sort(key=node_rank)
                    sub_external_nodes = dict(
                        (ext_node, position)
                        for position, ext_node in enumerate(external_node_list))
                    sub_graph.external_nodes = sub_external_nodes
                    sub_nt = NonterminalLabel(
                        "%s%i" % (subpartition.phrase, len(sub_external_nodes)),
                        index)
                    children.append(
                        convert_chart(subpartition, sub_external_nodes,
                                      sub_nt))  # Recursive call
                    partition_graph = partition_graph.collapse_fragment2(
                        sub_graph,
                        sub_nt,
                        external=external_node_list,
                        warn=False)

                    spans_to_nt[subpartition.str_start] = (
                        sub_nt, subpartition.str_end)

                partition_graph.roots = partition_graph.find_roots()
                partition_graph.roots.sort(key=node_rank)

                # Assemble String rule: a span covered by a sub-constituent is
                # replaced by its nonterminal, other positions keep the leaf.
                str_rhs = []
                i = partition.str_start
                while i <= partition.str_end:
                    if i in spans_to_nt:
                        # Jump to the end of the span the nonterminal covers.
                        new_nt, i = spans_to_nt[i]
                        str_rhs.append(new_nt)
                    else:
                        str_rhs.append(leaves[i])
                    i = i + 1

                rule = Rule(0, nt.label, partition_graph, tuple(str_rhs), 1)
                rule_id = self.add_rule(rule)

                poss.append((rule_id, children))

            return register(poss)
Esempio n. 27
0
        def compute_chart(tree, graph, prefix=""):
            """
            Recursively compute the chart.

            ``graph`` is the sub-graph we are considering and ``tree`` is the
            tree of spans. Unless the constituent is already in ``chart``,
            each binarized partitioning of the graph among the children of
            ``tree`` is explored recursively, and the list of child-partition
            lists is stored in ``chart`` under this constituent's partition.

            Args:
                tree: span tree for the current constituent.
                graph: sub-graph assigned to this constituent.
                prefix: extended by one space per recursion level; not
                    otherwise used in this function.

            Returns:
                The ``Partition`` for this constituent and its edge vector.

            Raises:
                ChartTooBigException: when ``chart`` holds more than
                    ``MAX_CHART_SIZE`` entries.
            """
            count[0] += 1

            triples = set(graph.triples())
            edge_vector = tuple(
                [1 if x in triples else 0 for x in graph_edge_list])

            leaves = tree.leaves()
            if len(tree) == 1 and not isinstance(tree[0],
                                                 fancy_tree.FancyTree):
                return Partition(tree.node, leaves[0], leaves[-1], edge_vector)

            # First get the set of aligned edges for this constituent and its
            # children.
            # NOTE(review): this value is never read afterwards; it is kept
            # because the rev_alignments lookups may matter (missing keys) --
            # confirm before removing.
            aligned_edges_for_span = set([
                edge for token in tree.leaves()
                for edge in rev_alignments[token]
            ])

            partition_object = Partition(tree.node, leaves[0], leaves[-1],
                                         edge_vector)
            if partition_object not in chart:
                try:
                    possibilities = []
                    child_edgesets = []
                    # Compute edge set for each child
                    for t in tree:
                        edgeset = []
                        for l in t.leaves():
                            edgeset.extend(rev_alignments[l])
                        child_edgesets.append(edgeset)

                    # For each possible partitioning
                    for cparts in get_binarized_partitions(
                            graph, child_edgesets):
                        child_forests = []
                        for i in range(len(tree)):
                            childgraph = Hgraph.from_triples(cparts[i], {},
                                                             warn=False)
                            sub_forest = compute_chart(tree[i],
                                                       childgraph,
                                                       prefix=prefix + " ")
                            if len(chart) > MAX_CHART_SIZE:
                                # NOTE(review): the message hard-codes 5000
                                # while the limit is MAX_CHART_SIZE.
                                raise ChartTooBigException(
                                    "Chart size exceeded 5000 entries. dropping this sentence."
                                )
                            child_forests.append(sub_forest)
                        possibilities.append(child_forests)

                    chart[partition_object] = possibilities
                except IncompatibleAlignmentException:
                    # Record the constituent that could not be aligned; the
                    # partition is still returned to the caller.
                    chart.inconsistent_alignment = (tree.node, leaves[0],
                                                    leaves[-1])
            return partition_object
Esempio n. 28
0
    def load_from_file(cls, in_file, rule_class = VoRule, reverse = False, nodelabels = False, logprob = False):
        """
        Loads a SHRG grammar from the given file. 
        See documentation for format details.

        rule_class specifies the type of rule to use. VoRule is a subclass using an arbitrary graph
        visit order (also used for strings). TdRule computes a tree decomposition on the first RHS
        when initialized.

        As read by this loader, a rule has the shape ``LHS -> RHS1 [| RHS2] ; [weight]``:

        * blank lines are skipped and everything after a "#" on a line is dropped;
        * a rule may span several lines and ends on the first line containing ";";
        * an empty weight yields 0.0, otherwise the weight is read as a float and
          its natural log is taken unless ``logprob`` is true;
        * the LHS of the first rule becomes ``output.start_symbol``;
        * either every rule or no rule may have two RHSs;
        * each RHS is first tried as a graph (``Hgraph.from_string``); if that
          fails and no earlier rule fixed that side as graph format, it is
          parsed as a string with ``parse_string``;
        * for two-RHS rules the (label, index) sets of the nonterminals on both
          sides must be equal.

        Parameters:
            in_file:    iterable yielding the lines of the grammar.
            rule_class: callable invoked as
                        ``rule_class(rule_id, lhs, weight, rhs1, rhs2, nodelabels=..., logprob=...)``.
            reverse:    for two-RHS rules, pass the RHSs to rule_class in swapped order.
            nodelabels: forwarded to Grammar and to rule_class.
            logprob:    weights in the file are already log values; also
                        forwarded to Grammar and to rule_class.

        Raises:
            GrammarError: malformed weight, rule without exactly one "->", more
                          than two RHSs, a two-RHS rule after single-RHS rules,
                          mismatching nonterminals, or rule_class failing.
            ParserError:  a graph RHS that cannot be parsed once that side is
                          known to be graph format, or a single-RHS rule after
                          two-RHS rules.
        """

        def error_text(exc):
            # ``message`` only exists on Python 2 exceptions and is empty for
            # multi-argument ones; fall back to str() in those cases.
            return getattr(exc, "message", None) or str(exc)

        def graph_nonterminals(graph):
            # (label, index) of every nonterminal edge label in a graph RHS.
            return set([(ntlabel.label, ntlabel.index)
                        for _head, ntlabel, _tail in graph.nonterminal_edges()])

        def string_nonterminals(tokens):
            # (label, index) of every nonterminal token in a string RHS.
            return set([(tok.label, tok.index)
                        for tok in tokens if isinstance(tok, NonterminalLabel)])

        output = Grammar(nodelabels = nodelabels, logprob = logprob)

        rule_count = 1
        line_count = 0
        is_synchronous = False

        # Format of each RHS side; fixed by the first rule that determines it.
        rhs1_type = None
        rhs2_type = None

        buf = StringIO.StringIO()

        for line in in_file:
            line_count += 1
            stripped = line.strip()
            if not stripped:
                continue

            # Drop a trailing comment, if any.
            content = stripped.split("#", 1)[0]

            # NOTE(review): the stripped pieces of a multi-line rule are
            # concatenated without a separator, so tokens on either side of a
            # line break are glued together -- confirm this is intended.
            buf.write(content.strip())
            if ";" not in content:
                continue  # The rule continues on the next line.

            rulestring = buf.getvalue()
            try:
                content, weights = rulestring.split(";", 1)
                if not weights:
                    weight = 0.0
                elif logprob:
                    weight = float(weights)
                else:
                    weight = math.log(float(weights))
            except (ValueError, OverflowError):
                # Non-numeric weight, or log of a non-positive number.
                raise GrammarError(
                    "Line %i, Rule %i: Error near end of line." % (line_count, rule_count))

            try:
                lhs, rhsstring = content.split("->")
            except ValueError:
                raise GrammarError(
                    "Line %i, Rule %i: Invalid rule format." % (line_count, rule_count))
            lhs = lhs.strip()
            if rule_count == 1:
                output.start_symbol = lhs

            if "|" in rhsstring:
                if not is_synchronous and rule_count > 1:
                    raise GrammarError(
                        "Line %i, Rule %i: All or none of the rules need to have two RHSs." % (line_count, rule_count))
                is_synchronous = True
                try:
                    rhs1, rhs2 = rhsstring.split("|")
                except ValueError:
                    raise GrammarError("Only up to two RHSs are allowed in grammar file.")
            else:
                # is_synchronous can only be true here if an earlier rule had
                # two RHSs. ParserError (not GrammarError) is kept so existing
                # callers keep catching the same type.
                if is_synchronous:
                    raise ParserError(
                        "Line %i, Rule %i: All or none of the rules need to have two RHSs." % (line_count, rule_count))
                is_synchronous = False
                rhs1 = rhsstring
                rhs2 = None

            # First RHS: if the first graph in the file cannot be parsed, assume it's a string.
            try:
                r1 = Hgraph.from_string(rhs1)
                r1_nts = graph_nonterminals(r1)
                if not rhs1_type:
                    rhs1_type = GRAPH_FORMAT
            except (ParserError, IndexError) as e:
                if rhs1_type == GRAPH_FORMAT:
                    raise ParserError(
                        "Line %i, Rule %i: Could not parse graph description: %s" % (line_count, rule_count, error_text(e)))
                r1 = parse_string(rhs1)
                r1_nts = string_nonterminals(r1)
                rhs1_type = STRING_FORMAT

            if is_synchronous:
                # Second RHS: skip the graph attempt once this side is known
                # to be string format. This is an explicit branch rather than
                # an assert so it survives ``python -O``.
                parsed_as_graph = False
                if not rhs2_type or rhs2_type == GRAPH_FORMAT:
                    try:
                        r2 = Hgraph.from_string(rhs2)
                        r2_nts = graph_nonterminals(r2)
                        if not rhs2_type:
                            rhs2_type = GRAPH_FORMAT
                        parsed_as_graph = True
                    except (ParserError, IndexError, AssertionError) as e:
                        if rhs2_type == GRAPH_FORMAT:
                            raise ParserError(
                                "Line %i, Rule %i: Could not parse graph description: %s" % (line_count, rule_count, error_text(e)))
                if not parsed_as_graph:
                    r2 = parse_string(rhs2)
                    r2_nts = string_nonterminals(r2)
                    rhs2_type = STRING_FORMAT

                # Verify that nonterminals match up
                if r1_nts != r2_nts:
                    raise GrammarError(
                        "Line %i, Rule %i: Nonterminals do not match between RHSs: %s %s" % (line_count, rule_count, str(r1_nts), str(r2_nts)))
            else:
                r2 = None

            if is_synchronous and reverse:
                first_rhs, second_rhs = r2, r1
            else:
                first_rhs, second_rhs = r1, r2
            try:
                output[rule_count] = rule_class(rule_count, lhs, weight, first_rhs, second_rhs, nodelabels = nodelabels, logprob = logprob)
            except Exception as e:
                # Re-raised with file position so the offending rule can be found.
                raise GrammarError(
                    "Line %i, Rule %i: Could not initialize rule. %s" % (line_count, rule_count, error_text(e)))

            # Start collecting the next rule.
            buf = StringIO.StringIO()
            rule_count += 1

        # NOTE(review): within this excerpt the function ends here without
        # returning ``output`` or recording is_synchronous / rhs1_type /
        # rhs2_type on it -- presumably the tail of the function is missing;
        # confirm against the full source.
Esempio n. 29
0
        def convert_chart(partition, external_nodes, nt, first=False):
            """
            Recursively turn the chart entry for ``partition`` into grammar rules.

            For a partition that is not in ``chart`` (a leaf) a single rule is
            created from the partition's edges and its string span. Otherwise one
            rule is created per possibility in ``chart[partition]``: every
            sub-partition with at least one edge is converted recursively and its
            subgraph is collapsed into a nonterminal edge of the parent graph.

            Uses ``seen``, ``result``, ``chart``, ``graph_edge_list``,
            ``node_order``, ``fragment_counter`` and ``self`` from the enclosing
            scope (not visible in this excerpt).

            Parameters:
                partition:      chart item; ``edges``, ``str_start`` and
                                ``str_end`` are read from it.
                external_nodes: assigned to ``external_nodes`` of every graph
                                built for this partition.
                nt:             nonterminal label; only ``nt.label`` is used.
                first:          unused in this function.

            Returns:
                The fragment id under which the rule alternatives for this
                partition were stored in ``result``. A partition that was already
                converted only has its use count incremented.
            """
            nt = NonterminalLabel(nt.label)  # Get rid of the index

            if partition in seen:
                node = seen[partition]
                result.use_counts[node] += 1
                return node

            def by_node_order(node_id):
                # Sort key; orders exactly like the former cmp function
                # ``node_order[x] - node_order[y]`` and also works on Python 3.
                return node_order[node_id]

            def edges_of(part):
                # Graph edges whose position is flagged with 1 in part.edges.
                return [graph_edge_list[i] for i in range(len(part.edges)) if part.edges[i] == 1]

            def rooted_graph(edge_list):
                # Build a graph from the edges and fix a deterministic root order.
                new_graph = Hgraph.from_triples(edge_list, {}, warn=False)
                new_graph.roots = new_graph.find_roots()
                new_graph.roots.sort(key=by_node_order)
                return new_graph

            def register(alternatives):
                # Store the alternatives under a fresh fragment id and remember
                # the partition so that later visits reuse it.
                fragment = fragment_counter[0]
                result[fragment] = alternatives
                result.use_counts[fragment] += 1
                seen[partition] = fragment
                fragment_counter[0] += 1
                return fragment

            leaves = chart.tree.leaves()
            edges_in_partition = edges_of(partition)

            if partition not in chart:  # leaf
                graph = rooted_graph(edges_in_partition)
                graph.external_nodes = external_nodes
                str_rhs = [leaves[i] for i in range(partition.str_start, partition.str_end + 1)]
                rule = Rule(0, nt.label, graph, tuple(str_rhs), 1)
                rule_id = self.add_rule(rule)
                return register([(rule_id, [])])

            poss = []
            for possibility in chart[partition]:
                # This is the parent graph; it is rebuilt for every possibility
                # because collapsing the sub-constituents replaces it.
                partition_graph = rooted_graph(edges_in_partition)
                partition_graph.external_nodes = external_nodes
                children = []

                # Maps the first string position of a sub-constituent to its
                # nonterminal and last string position.
                spans_to_nt = {}

                # These are the different sub-constituents. The index counts
                # all of them, including those without edges.
                for index, subpartition in enumerate(possibility, 1):
                    edges_in_subpartition = edges_of(subpartition)
                    if not edges_in_subpartition:
                        # Some constituents do not have any edges aligned to
                        # them; their words stay terminals in the string RHS.
                        continue

                    sub_graph = rooted_graph(edges_in_subpartition)
                    external_node_list = partition_graph.find_external_nodes2(sub_graph)
                    external_node_list.sort(key=by_node_order)
                    sub_external_nodes = dict(
                        (node_id, position) for position, node_id in enumerate(external_node_list)
                    )
                    sub_graph.external_nodes = sub_external_nodes
                    sub_nt = NonterminalLabel("%s%i" % (subpartition.phrase, len(sub_external_nodes)), index)
                    children.append(convert_chart(subpartition, sub_external_nodes, sub_nt))  # Recursive call
                    partition_graph = partition_graph.collapse_fragment2(
                        sub_graph, sub_nt, external=external_node_list, warn=False
                    )

                    spans_to_nt[subpartition.str_start] = (sub_nt, subpartition.str_end)

                partition_graph.roots = partition_graph.find_roots()
                partition_graph.roots.sort(key=by_node_order)

                # Assemble String rule: a converted sub-constituent contributes
                # its nonterminal and its whole span is skipped; every other
                # position contributes its leaf.
                str_rhs = []
                i = partition.str_start
                while i <= partition.str_end:
                    if i in spans_to_nt:
                        new_nt, i = spans_to_nt[i]
                        str_rhs.append(new_nt)
                    else:
                        str_rhs.append(leaves[i])
                    i = i + 1

                rule = Rule(0, nt.label, partition_graph, tuple(str_rhs), 1)
                rule_id = self.add_rule(rule)

                poss.append((rule_id, children))

            return register(poss)
Esempio n. 30
0
 def get_graph(self, graph):
     """
     Return an Hgraph made of those triples of ``graph`` whose position is
     flagged with 1 in ``self.edges``.
     """
     all_triples = graph.triples()
     selected = []
     for position, triple in enumerate(all_triples):
         if self.edges[position] == 1:
             selected.append(triple)
     return Hgraph.from_triples(selected, {}, warn=False)
Esempio n. 31
0
        # Report the RHS format(s) and the number of rules of the loaded grammar;
        # the "-to-<rhs2_type>" part is only shown when grammar.rhs2_type is set.
        log.info("Loaded %s%s grammar with %i rules."\
            % (grammar.rhs1_type, "-to-%s" % grammar.rhs2_type if grammar.rhs2_type else '', len(grammar)))
 

        # EM training 
        # config.train is both the on/off switch and the iteration count
        # handed to grammar.em().
        if config.train:
            iterations = config.train
            if not config.input_file: 
                log.err("Please specify corpus file for EM training.")
                sys.exit(1)
            # With config.bitext the corpus is read through read_pairs and
            # trained in "synchronous" mode; otherwise every input line is
            # parsed as a graph and trained in "forward" mode.
            if config.bitext:
                corpus = list(read_pairs(fileinput.input(config.input_file)))
                grammar.em(corpus, iterations, parser_class, "synchronous")
            else: 
                corpus = [Hgraph.from_string(x) for x in fileinput.input(config.input_file)]
                grammar.em(corpus, iterations, parser_class, "forward")
            # Write the grammar entries in ascending id order, one per line,
            # then terminate the program.
            for rid in sorted(grammar.keys()): 
                output_file.write(str(grammar[rid]))
                output_file.write("\n")
            sys.exit(0)

        # Normalization
        if config.normalize:
            # NOTE(review): this condition reads config.input_files, while the
            # EM branch above reads config.input_file -- confirm that the
            # plural attribute exists on config.
            if config.bitext or grammar.rhs2_type is None or config.g or (config.k and not config.input_files):
                grammar.normalize_lhs()
            else:
                grammar.normalize_rhs2()
            # Write the normalized grammar entries in ascending id order.
            for rid in sorted(grammar.keys()): 
                output_file.write(str(grammar[rid]))
                output_file.write("\n")
Esempio n. 32
0
        # Report the RHS format(s) and the number of rules of the loaded grammar;
        # the "-to-<rhs2_type>" part is only shown when grammar.rhs2_type is set.
        log.info("Loaded %s%s grammar with %i rules."\
            % (grammar.rhs1_type, "-to-%s" % grammar.rhs2_type if grammar.rhs2_type else '', len(grammar)))

        # EM training
        # config.train is both the on/off switch and the iteration count
        # handed to grammar.em().
        if config.train:
            iterations = config.train
            if not config.input_file:
                log.err("Please specify corpus file for EM training.")
                sys.exit(1)
            # With config.bitext the corpus is read through read_pairs and
            # trained in "synchronous" mode; otherwise every input line is
            # parsed as a graph and trained in "forward" mode.
            if config.bitext:
                corpus = list(read_pairs(fileinput.input(config.input_file)))
                grammar.em(corpus, iterations, parser_class, "synchronous")
            else:
                corpus = [
                    Hgraph.from_string(x)
                    for x in fileinput.input(config.input_file)
                ]
                grammar.em(corpus, iterations, parser_class, "forward")
            # Write the grammar entries in ascending id order, one per line,
            # then terminate the program.
            for rid in sorted(grammar.keys()):
                output_file.write(str(grammar[rid]))
                output_file.write("\n")
            sys.exit(0)

        # Normalization
        if config.normalize:
            # NOTE(review): this condition reads config.input_files, while the
            # EM branch above reads config.input_file -- confirm that the
            # plural attribute exists on config.
            if config.bitext or grammar.rhs2_type is None or config.g or (
                    config.k and not config.input_files):
                grammar.normalize_lhs()
            else:
                grammar.normalize_rhs2()
Esempio n. 33
0
    def load_from_file(cls,
                       in_file,
                       rule_class=VoRule,
                       reverse=False,
                       nodelabels=False,
                       logprob=False):
        """
        Loads a SHRG grammar from the given file. 
        See documentation for format details.

        rule_class specifies the type of rule to use. VoRule is a subclass using an arbitrary graph
        visit order (also used for strings). TdRule computes a tree decomposition on the first RHS
        when initialized.

        As read by this loader, a rule has the shape ``LHS -> RHS1 [| RHS2] ; [weight]``:

        * blank lines are skipped and everything after a "#" on a line is dropped;
        * a rule may span several lines and ends on the first line containing ";";
        * an empty weight yields 0.0, otherwise the weight is read as a float and
          its natural log is taken unless ``logprob`` is true;
        * the LHS of the first rule becomes ``output.start_symbol``;
        * either every rule or no rule may have two RHSs;
        * each RHS is first tried as a graph (``Hgraph.from_string``); if that
          fails and no earlier rule fixed that side as graph format, it is
          parsed as a string with ``parse_string``;
        * for two-RHS rules the (label, index) sets of the nonterminals on both
          sides must be equal.

        Parameters:
            in_file:    iterable yielding the lines of the grammar.
            rule_class: callable invoked as
                        ``rule_class(rule_id, lhs, weight, rhs1, rhs2, nodelabels=..., logprob=...)``.
            reverse:    for two-RHS rules, pass the RHSs to rule_class in swapped order.
            nodelabels: forwarded to Grammar and to rule_class.
            logprob:    weights in the file are already log values; also
                        forwarded to Grammar and to rule_class.

        Raises:
            GrammarError: malformed weight, rule without exactly one "->", more
                          than two RHSs, a two-RHS rule after single-RHS rules,
                          mismatching nonterminals, or rule_class failing.
            ParserError:  a graph RHS that cannot be parsed once that side is
                          known to be graph format, or a single-RHS rule after
                          two-RHS rules.
        """

        def describe(exc):
            # ``message`` only exists on Python 2 exceptions and is empty for
            # multi-argument ones; fall back to str() in those cases.
            return getattr(exc, "message", None) or str(exc)

        def nts_of_graph(graph):
            # (label, index) of every nonterminal edge label in a graph RHS.
            return set([(ntlabel.label, ntlabel.index)
                        for _head, ntlabel, _tail in graph.nonterminal_edges()])

        def nts_of_string(tokens):
            # (label, index) of every nonterminal token in a string RHS.
            return set([(tok.label, tok.index)
                        for tok in tokens
                        if isinstance(tok, NonterminalLabel)])

        output = Grammar(nodelabels=nodelabels, logprob=logprob)

        rule_count = 1
        line_count = 0
        is_synchronous = False

        # Format of each RHS side; fixed by the first rule that determines it.
        rhs1_type = None
        rhs2_type = None

        buf = StringIO.StringIO()

        for line in in_file:
            line_count += 1
            text = line.strip()
            if not text:
                continue

            # Drop a trailing comment, if any.
            content = text.split("#", 1)[0]

            # NOTE(review): the stripped pieces of a multi-line rule are
            # concatenated without a separator, so tokens on either side of a
            # line break are glued together -- confirm this is intended.
            buf.write(content.strip())
            if ";" not in content:
                continue  # The rule continues on the next line.

            rulestring = buf.getvalue()
            try:
                content, weights = rulestring.split(";", 1)
                if not weights:
                    weight = 0.0
                elif logprob:
                    weight = float(weights)
                else:
                    weight = math.log(float(weights))
            except (ValueError, OverflowError):
                # Non-numeric weight, or log of a non-positive number.
                raise GrammarError(
                    "Line %i, Rule %i: Error near end of line." %
                    (line_count, rule_count))

            try:
                lhs, rhsstring = content.split("->")
            except ValueError:
                raise GrammarError(
                    "Line %i, Rule %i: Invalid rule format." %
                    (line_count, rule_count))
            lhs = lhs.strip()
            if rule_count == 1:
                output.start_symbol = lhs

            if "|" in rhsstring:
                if not is_synchronous and rule_count > 1:
                    raise GrammarError(
                        "Line %i, Rule %i: All or none of the rules need to have two RHSs."
                        % (line_count, rule_count))
                is_synchronous = True
                try:
                    rhs1, rhs2 = rhsstring.split("|")
                except ValueError:
                    raise GrammarError(
                        "Only up to two RHSs are allowed in grammar file.")
            else:
                # is_synchronous can only be true here if an earlier rule had
                # two RHSs. ParserError (not GrammarError) is kept so existing
                # callers keep catching the same type.
                if is_synchronous:
                    raise ParserError(
                        "Line %i, Rule %i: All or none of the rules need to have two RHSs."
                        % (line_count, rule_count))
                is_synchronous = False
                rhs1 = rhsstring
                rhs2 = None

            # First RHS: if the first graph in the file cannot be parsed, assume it's a string.
            try:
                r1 = Hgraph.from_string(rhs1)
                r1_nts = nts_of_graph(r1)
                if not rhs1_type:
                    rhs1_type = GRAPH_FORMAT
            except (ParserError, IndexError) as e:
                if rhs1_type == GRAPH_FORMAT:
                    raise ParserError(
                        "Line %i, Rule %i: Could not parse graph description: %s"
                        % (line_count, rule_count, describe(e)))
                r1 = parse_string(rhs1)
                r1_nts = nts_of_string(r1)
                rhs1_type = STRING_FORMAT

            if is_synchronous:
                # Second RHS: skip the graph attempt once this side is known
                # to be string format. This is an explicit branch rather than
                # an assert so it survives ``python -O``.
                graph_ok = False
                if not rhs2_type or rhs2_type == GRAPH_FORMAT:
                    try:
                        r2 = Hgraph.from_string(rhs2)
                        r2_nts = nts_of_graph(r2)
                        if not rhs2_type:
                            rhs2_type = GRAPH_FORMAT
                        graph_ok = True
                    except (ParserError, IndexError, AssertionError) as e:
                        if rhs2_type == GRAPH_FORMAT:
                            raise ParserError(
                                "Line %i, Rule %i: Could not parse graph description: %s"
                                % (line_count, rule_count, describe(e)))
                if not graph_ok:
                    r2 = parse_string(rhs2)
                    r2_nts = nts_of_string(r2)
                    rhs2_type = STRING_FORMAT

                # Verify that nonterminals match up
                if r1_nts != r2_nts:
                    raise GrammarError(
                        "Line %i, Rule %i: Nonterminals do not match between RHSs: %s %s"
                        % (line_count, rule_count, str(r1_nts), str(r2_nts)))
            else:
                r2 = None

            if is_synchronous and reverse:
                first_rhs, second_rhs = r2, r1
            else:
                first_rhs, second_rhs = r1, r2
            try:
                output[rule_count] = rule_class(
                    rule_count,
                    lhs,
                    weight,
                    first_rhs,
                    second_rhs,
                    nodelabels=nodelabels,
                    logprob=logprob)
            except Exception as e:
                # Re-raised with file position so the offending rule can be found.
                raise GrammarError(
                    "Line %i, Rule %i: Could not initialize rule. %s" %
                    (line_count, rule_count, describe(e)))

            # Start collecting the next rule.
            buf = StringIO.StringIO()
            rule_count += 1

        # NOTE(review): within this excerpt the function ends here without
        # returning ``output`` or recording is_synchronous / rhs1_type /
        # rhs2_type on it -- presumably the tail of the function is missing;
        # confirm against the full source.