def delete_nodes(graph, nodes):
    # Return a copy of graph without edges whose head or single tail node is in
    # nodes; endpoints of dropped edges are kept as isolated nodes when they are
    # not themselves deleted.
    g = Hgraph()
    for p, r, ch in graph.triples():
        if (p not in nodes) and (not (len(ch) == 1 and ch[0] in nodes)):
            g._add_triple(p, r, ch, warn=False)
        else:
            if (p not in nodes) and (p not in g):
                g[p] = ListMap()
            if len(ch) == 1 and (ch[0] not in nodes) and (ch[0] not in g):
                g[ch[0]] = ListMap()
    g.roots = g.find_roots(warn=False)
    return g
def to_hgraph(self):
    from common.hgraph.hgraph import Hgraph
    hgraph = Hgraph()
    hgraph.my_hyper_graph = self

    for node in self.nodes:  # type: GraphNode
        label = ""
        ext_id = None
        ident = "_" + node.name

        # Insert a node into the AMR
        ignoreme = hgraph[ident]  # Initialize dictionary for this node
        hgraph.node_to_concepts[ident] = label

        if ext_id is not None:
            if ident in hgraph.external_nodes and hgraph.external_nodes[ident] != ext_id:
                raise Exception("Incompatible external node IDs for node %s." % ident)
            hgraph.external_nodes[ident] = ext_id
            hgraph.rev_external_nodes[ext_id] = ident
        if node.is_root:
            hgraph.roots.append(ident)

    for edge in self.edges:  # type: HyperEdge
        hyperchild = tuple("_" + node.name for node in edge.nodes[1:])
        ident = "_" + edge.nodes[0].name
        new_edge = edge.label
        hgraph._add_triple(ident, new_edge, hyperchild)

    return hgraph
def get_line_graph(graph):
    # Build the line graph: each original edge becomes a node, and two edges
    # are connected (in both directions) if they share a vertex in the
    # original graph.
    lgraph = Hgraph()
    edges_for_node = defaultdict(list)
    for p, r, ch in graph.triples():
        edges_for_node[p].append(str((p, r, ch)))
        for c in ch:
            edges_for_node[c].append(str((p, r, ch)))
    for r in edges_for_node:
        for p, c in itertools.combinations(edges_for_node[r], 2):
            lgraph._add_triple(p, r, (c,), warn=False)
            lgraph._add_triple(c, r, (p,), warn=False)
    lgraph.roots = lgraph.find_roots()
    return lgraph
def parse_bitexts(self, pair_iterator):
    """
    Parse all pairs of input objects returned by the pair iterator.
    This is a generator.
    """
    for line1, line2 in pair_iterator:
        if self.grammar.rhs1_type == "hypergraph":
            obj1 = Hgraph.from_string(line1)
        else:
            obj1 = line1.strip().split()
        if self.grammar.rhs2_type == "hypergraph":
            obj2 = Hgraph.from_string(line2)
        else:
            obj2 = line2.strip().split()
        raw_chart = self.parse_bitext(obj1, obj2)
        yield cky_chart(raw_chart)
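# A minimal usage sketch for parse_bitexts (not from the original module). It
# assumes a parser object whose grammar was loaded elsewhere and a
# tab-separated bitext file; iter_tab_pairs is a hypothetical stand-in for the
# codebase's own pair reader (read_pairs), written out only to show the shape
# of the expected input.
def iter_tab_pairs(lines, separator="\t"):
    # Yield (left, right) string pairs from tab-separated input lines.
    for line in lines:
        left, right = line.rstrip("\n").split(separator, 1)
        yield left, right

# for chart in parser.parse_bitexts(iter_tab_pairs(open("corpus.txt"))):
#     pass  # each yielded object is a cky_chart over one sentence/graph pair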
def combiner(item, childobjs):
    graph = leaf(item)
    for nt, cgraph in childobjs.items():
        p, r, c = graph.find_nt_edge(*nt)
        fragment = Hgraph.from_triples([(p, r, c)], graph.node_to_concepts)
        try:
            graph = graph.replace_fragment(fragment, cgraph)
        except AssertionError, e:
            raise DerivationException, "Incompatible hyperedge type for nonterminal %s." % str(nt[0])
    return graph
def get_binarized_partitions(graph, edgesets):
    if len(edgesets) == 1:
        yield [graph.triples()]
        return
    gen = get_partitions(graph, edgesets[0],
                         [edge for edgeset in edgesets[1:] for edge in edgeset])
    for left_edges, right_edges in gen:
        possibilities = get_binarized_partitions(
            Hgraph.from_triples(right_edges, {}, warn=False), edgesets[1:])
        poss_list = list(possibilities)
        for partitions in poss_list:
            yield [left_edges] + partitions
def compute_chart(tree, graph, prefix=""):
    # Recursively compute the chart. Graph is the sub-graph we're considering,
    # tree is the tree of spans.
    count[0] += 1
    triples = set(graph.triples())
    edge_vector = tuple([1 if x in triples else 0 for x in graph_edge_list])
    leaves = tree.leaves()

    # if not isinstance(tree, fancy_tree.FancyTree):
    #     triples = set(graph.triples())
    #     edge_vector = tuple(1 for x in graph_edge_list if x in triples else 0)
    #     return Partition(tree.node, leaves[0], leaves[-1], edge_vector)
    # else:
    if len(tree) == 1 and not isinstance(tree[0], fancy_tree.FancyTree):
        return Partition(tree.node, leaves[0], leaves[-1], edge_vector)

    # First get the set of aligned edges for this constituent and its children
    aligned_edges_for_span = set([edge for token in tree.leaves()
                                  for edge in rev_alignments[token]])
    partition_object = Partition(tree.node, leaves[0], leaves[-1], edge_vector)
    if partition_object not in chart:
        try:
            possibilities = []
            child_edgesets = []
            # Compute edge set for each child
            for t in tree:
                edgeset = []
                for l in t.leaves():
                    edgeset.extend(rev_alignments[l])
                child_edgesets.append(edgeset)
            # For each possible partitioning
            for cparts in get_binarized_partitions(graph, child_edgesets):
                child_forests = []
                for i in range(len(tree)):
                    childgraph = Hgraph.from_triples(cparts[i], {}, warn=False)
                    sub_forest = compute_chart(tree[i], childgraph, prefix=prefix + " ")
                    if len(chart) > MAX_CHART_SIZE:
                        raise ChartTooBigException, "Chart size exceeded %i entries. Dropping this sentence." % MAX_CHART_SIZE
                    child_forests.append(sub_forest)
                possibilities.append(child_forests)
            chart[partition_object] = possibilities
        except IncompatibleAlignmentException:
            chart.inconsistent_alignment = (tree.node, leaves[0], leaves[-1])
            return partition_object
    return partition_object
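# Illustrative sketch (not part of the original code): how a subgraph is
# encoded as a bit vector over a fixed, ordered edge list, which is what makes
# the Partition objects above usable as hashable chart keys. The edge list and
# triples below are made up for demonstration; all_edges plays the role of
# graph_edge_list.
all_edges = [("w", "arg0", ("b",)), ("w", "arg1", ("g",)), ("g", "arg0", ("b",))]
subgraph = set([all_edges[0], all_edges[2]])
edge_vector = tuple(1 if e in subgraph else 0 for e in all_edges)
assert edge_vector == (1, 0, 1)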
def test():
    tree = FancyTree(""" (S (NP (DT The) (NN boy)) (VP (VBZ wants) (NP (DT the) (NN girl) (S (VP (TO to) (VP (VB believe) (NP (PRP him))))))) (. .))""")
    graph = Hgraph.from_string(
        "(w.want :arg0 b.boy :arg1 (b2.believe :arg0 (g.girl) :arg1 b.))")
    graph.node_alignments = {"b": [1], "w": [2], "g": [4], "b2": [6]}
    graph = graph.to_instance_edges()
def merge_string_nonterminals(self, string, amr, next_id):
    """
    Binarizes a string-graph pair consisting entirely of nonterminals, ensuring
    correct visit order for parsing.
    """
    rules = []
    stack = []
    tokens = list(reversed([s for s in string if s]))

    # standard shift-reduce binarization algorithm
    # TODO add citation after paper is published
    while tokens:
        next_tok = tokens.pop()
        next_tok_triple_l = [t for t in amr.triples() if str(t[1]) == next_tok]
        assert len(next_tok_triple_l) == 1
        next_tok_triple = next_tok_triple_l[0]

        if not stack:
            stack.append(next_tok)
            continue

        stack_top = stack.pop()
        stack_top_triple = [t for t in amr.triples() if str(t[1]) == stack_top][0]

        if (stack_top_triple[0] not in next_tok_triple[2]) and \
           (next_tok_triple[0] not in stack_top_triple[2]):
            # can't merge, so shift
            stack.append(stack_top)
            stack.append(next_tok)
            continue

        # can merge, so reduce
        rule_amr = Hgraph.from_triples([stack_top_triple, next_tok_triple])
        assert len(rule_amr.roots) == 1
        rule_string = [stack_top, next_tok]
        fictitious_tree = self.make_fictitious_tree(string, rule_string)
        new_rule, tree, amr, next_id = self.make_rule(fictitious_tree, amr,
                                                      Tree('X', rule_string),
                                                      rule_amr, next_id)
        string = tree.leaves()
        tokens.append('#%s' % new_rule.symbol)
        rules.append(new_rule)

    if len(stack) > 1:
        raise BinarizationException
    return string, amr, rules, next_id
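# Illustrative sketch of the shift-reduce pattern used above, stripped of the
# Hgraph/rule machinery (the helper and the example tokens are made up for
# demonstration): tokens are scanned left to right; when the next token can
# combine with the stack top, the pair is reduced into a single new symbol that
# goes back on the queue, otherwise the token is shifted onto the stack.
def shift_reduce(tokens, can_merge, merge):
    stack = []
    queue = list(reversed(tokens))
    while queue:
        nxt = queue.pop()
        if stack and can_merge(stack[-1], nxt):
            queue.append(merge(stack.pop(), nxt))  # reduce: push merged symbol back
        else:
            stack.append(nxt)  # shift
    return stack

# Example: merge adjacent tokens that share a first letter.
result = shift_reduce(["ab", "ac", "bd"],
                      can_merge=lambda a, b: a[0] == b[0],
                      merge=lambda a, b: a + "+" + b)
assert result == ["ab+ac", "bd"]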
def replace_instance_edges(graph, tree):
    t = []
    alignments = {}
    tree_leaves = tree.leaves()
    for e in graph.triples():
        if e in graph.edge_alignments:
            p, r, ch = e
            token = graph.edge_alignments[e][0]  # TODO: not sure what to do with multiple tokens
            new_edge = (p, "%s'" % tree_leaves[token], ch)
            t.append(new_edge)
            alignments[new_edge] = [token]
        else:
            t.append(e)
    res = Hgraph.from_triples(t, {}, warn=False)
    res.edge_alignments = alignments
    res.node_alignments = graph.node_alignments
    res.roots = graph.roots
    res.external_nodes = graph.external_nodes
    return res
def collapse_amr_terminals(self, tree, amr, next_id):
    """
    Creates new rules by merging terminal subgraphs with their closest
    nonterminal edge.
    """
    # triples() returns edges in breadth-first order, so the first triples in
    # the list are closest to the root of the AMR
    nonterminals = list(reversed([t for t in amr.triples()
                                  if isinstance(t[1], NonterminalLabel)]))
    rules = []
    first = True
    while nonterminals:
        nt = nonterminals.pop()
        # in general, we will attach to a given nonterminal edge all of the
        # terminal edges reachable from its tail nodes
        attached_terminals = self.terminal_search(nt, amr.triples())
        if first:
            # we still have to handle terminal edges that are higher than any
            # nonterminal edge.
            # because the first nonterminal edge is closest to the root of the
            # AMR, it must be reachable from the root without passing through
            # any other nonterminal, so we can attach all the high terminals
            # (those reachable from the root) to the first nonterminal
            attached_terminals |= self.terminal_search(amr.root_edges()[0], amr.triples())
            attached_terminals |= {amr.root_edges()[0]}
            first = False
        # don't bother making a rule when there's nothing to collapse
        if not attached_terminals:
            continue
        rule_amr = Hgraph.from_triples({nt} | attached_terminals)
        rule_tree = str(nt[1])
        assert len(rule_amr.roots) == 1
        new_rule, tree, amr, next_id = self.make_rule(tree, amr, rule_tree,
                                                      rule_amr, next_id)
        rules.append(new_rule)
    return tree, amr, rules, next_id
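# Illustrative sketch (assumed, not the original terminal_search): collecting
# the terminal edges reachable from the tail nodes of a starting edge, over a
# toy triple list of the form (parent, label, (children...)). Here
# '#'-prefixed labels stand in for nonterminal edges, which are not traversed.
def reachable_terminal_edges(start_edge, triples,
                             is_terminal=lambda e: not str(e[1]).startswith("#")):
    frontier = list(start_edge[2])
    seen_nodes, found = set(), set()
    while frontier:
        node = frontier.pop()
        if node in seen_nodes:
            continue
        seen_nodes.add(node)
        for e in triples:
            if e[0] == node and is_terminal(e):
                found.add(e)
                frontier.extend(e[2])
    return found

toy = [("a", "arg0", ("b",)), ("b", "arg1", ("c",)), ("c", "#X", ("d",))]
assert reachable_terminal_edges(("root", "#NT", ("a",)), toy) == {toy[0], toy[1]}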
def rhs_to_hgraph(self):
    from common.cfg import NonterminalLabel
    from common.hgraph.hgraph import Hgraph

    nt_id_count = 0
    hgraph = Hgraph()

    for node in self.rhs.nodes:  # type: GraphNode
        label = ""
        try:
            ext_id = self.lhs.nodes.index(node)
        except ValueError:
            ext_id = None
        ident = "_" + node.name

        # Insert a node into the AMR
        ignoreme = hgraph[ident]  # Initialize dictionary for this node
        hgraph.node_to_concepts[ident] = label

        if ext_id is not None:
            if ident in hgraph.external_nodes and hgraph.external_nodes[ident] != ext_id:
                raise Exception("Incompatible external node IDs for node %s." % ident)
            hgraph.external_nodes[ident] = ext_id
            hgraph.rev_external_nodes[ext_id] = ident
        if ext_id == 0:
            hgraph.roots.append(ident)

    for edge in self.rhs.edges:  # type: HyperEdge
        hyperchild = tuple("_" + node.name for node in edge.nodes[1:])
        ident = "_" + edge.nodes[0].name
        if "_" not in edge.label and not edge.label.startswith("ARG") \
                and not edge.label.startswith("BV"):
            # this is a nonterminal edge
            new_edge = NonterminalLabel(edge.label)
            if not new_edge.index:
                new_edge.index = "_%i" % nt_id_count
                nt_id_count = nt_id_count + 1
        else:
            new_edge = edge.label
        hgraph._add_triple(ident, new_edge, hyperchild)

    return hgraph
def merge_tree_symbols(self, tree, amr, next_id):
    """
    Binarizes a tree-graph pair according to the binarization dictated by the
    tree. WILL FAIL OFTEN IF TREE IS NOT BINARIZED.
    """
    rules = []
    while True:
        if not isinstance(tree, Tree):
            assert len(amr.triples()) == 1
            return tree, amr, rules, next_id

        # a collapsible subtree consists of
        # 1. many terminals
        # 2. one nonterminal and many terminals
        # 3. two nonterminals
        collapsible_subtrees = []
        for st in tree.subtrees():
            terminals = [t for t in st.leaves() if t[0] == '#']
            if len(terminals) == 1:
                collapsible_subtrees.append(st)
            elif len(terminals) == 2 and len(st.leaves()) == 2:
                collapsible_subtrees.append(st)

        # if there are no subtrees to collapse, this rule isn't binarizable
        if len(collapsible_subtrees) == 0:
            raise BinarizationException

        rule_tree = max(collapsible_subtrees, key=lambda x: x.height())
        terminals = [t for t in rule_tree.leaves() if t[0] == '#']
        rule_edge_l = [t for t in amr.triples() if str(t[1]) in terminals]
        rule_amr = Hgraph.from_triples(rule_edge_l)

        # if the induced graph is disconnected, this rule isn't binarizable
        if len(rule_amr.roots) != 1:
            raise BinarizationException

        new_rule, tree, amr, next_id = self.make_rule(tree, amr, rule_tree,
                                                      rule_amr, next_id)
        rules.append(new_rule)
    return tree, amr, rules, next_id
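# Illustrative sketch (assumed, not the original code): the "collapsible
# subtree" test from merge_tree_symbols, expressed over plain lists of leaves,
# where leaves starting with '#' stand in for nonterminal markers.
def is_collapsible(leaves):
    marked = [t for t in leaves if t.startswith('#')]
    return len(marked) == 1 or (len(marked) == 2 and len(leaves) == 2)

assert is_collapsible(['#X1', 'the', 'boy'])       # one marker, many terminals
assert is_collapsible(['#X1', '#X2'])              # two markers only
assert not is_collapsible(['#X1', '#X2', 'boy'])   # two markers plus a terminal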
def collapse_string_terminals(self, string, amr, next_id):
    """
    Creates new rules by merging terminal tokens with their closest
    nonterminal. All terminals attach to the left (except for terminals left
    of the first nonterminal, which attach right).
    """
    nonterminals = list(reversed([t for t in string if t[0] == '#']))
    rules = []
    # attach first terminals to the right
    slice_from = 0
    while nonterminals:
        nt = nonterminals.pop()
        if nonterminals:
            slice_to = string.index(nonterminals[-1])
        else:
            slice_to = len(string)
        if slice_to - slice_from == 1:
            # there are no terminals to attach here, so skip ahead
            slice_from = slice_to
            continue
        rule_string = string[slice_from:slice_to]
        nt_edge_l = [e for e in amr.triples(nodelabels=self.nodelabels)
                     if str(e[1]) == nt]
        assert len(nt_edge_l) == 1
        rule_amr = Hgraph.from_triples(nt_edge_l)
        # hallucinate a tree with acceptable structure for make_rule
        fictitious_tree = self.make_fictitious_tree(string, rule_string)
        new_rule, tree, amr, next_id = self.make_rule(fictitious_tree, amr,
                                                      Tree('X', rule_string),
                                                      rule_amr, next_id)
        string = tree.leaves()
        rules.append(new_rule)
        slice_from = slice_from + 1
    return string, amr, rules, next_id
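# Illustrative sketch (assumed, not the original code): how the docstring's
# attachment policy partitions a token sequence into spans, one per
# '#'-prefixed nonterminal. Terminals attach to the nonterminal on their left,
# except for the leading terminals, which attach to the first nonterminal on
# their right.
def terminal_spans(tokens):
    nt_positions = [i for i, t in enumerate(tokens) if t.startswith('#')]
    spans = []
    start = 0
    for j, pos in enumerate(nt_positions):
        end = nt_positions[j + 1] if j + 1 < len(nt_positions) else len(tokens)
        spans.append(tokens[start:end])
        start = end
    return spans

assert terminal_spans(['the', '#X1', 'of', 'a', '#X2', '.']) == \
       [['the', '#X1', 'of', 'a'], ['#X2', '.']]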
def main():
    # Read graph descriptions from the first column of a tab-separated file and
    # render each one to an image.
    graphs = set([line.strip().split('\t')[0] for line in open(sys.argv[1])])
    for i, graph in enumerate(graphs):
        g = Hgraph.from_string(graph)
        g.render_to_file("{0}_{1}.jpg".format(sys.argv[2], i))
def tree_decomposition_edge(graph_edge, visited, amr, nodelabels=False):
    visited.add(graph_edge)
    tree_node = TreeNode()
    if nodelabels:
        head = graph_edge[0][0]
        if graph_edge[2]:
            nodes, labels = zip(*graph_edge[2])
        else:
            nodes = ()
    else:
        head = graph_edge[0]
        nodes = graph_edge[2]
    tree_node.graph_nodes.add(head)
    tree_node.graph_nodes |= set(nodes)
    tree_node.graph_edge = graph_edge
    tree_node.first_child = tree_decomposition_node(nodes, visited, amr,
                                                    nodelabels=nodelabels)
    return tree_node


if __name__ == "__main__":
    from common.hgraph.hgraph import Hgraph
    graph = Hgraph.from_string("(n :P$1 :arg0 (a.n :E$2) :arg1 (n :S$3 a.))")
    td = tree_decomposition(graph)
def get_graph(self, graph):
    # Select the edges marked in this partition's bit vector and build the
    # corresponding subgraph.
    trips = graph.triples()
    return Hgraph.from_triples([trips[i] for i in range(len(trips))
                                if self.edges[i] == 1], {}, warn=False)
def convert_chart(partition, external_nodes, nt, first=False):
    nt = NonterminalLabel(nt.label)  # Get rid of the index
    if partition in seen:
        node = seen[partition]
        result.use_counts[node] += 1
        return node

    leaves = chart.tree.leaves()
    edges_in_partition = [graph_edge_list[i] for i in range(len(partition.edges))
                          if partition.edges[i] == 1]

    if not partition in chart:  # leaf
        graph = Hgraph.from_triples(edges_in_partition, {}, warn=False)
        graph.roots = graph.find_roots()
        graph.roots.sort(lambda x, y: node_order[x] - node_order[y])
        graph.external_nodes = external_nodes
        str_rhs = [leaves[i] for i in range(partition.str_start, partition.str_end + 1)]
        rule = Rule(0, nt.label, graph, tuple(str_rhs), 1)
        rule_id = self.add_rule(rule)
        fragment = fragment_counter[0]
        result[fragment] = [(rule_id, [])]
        result.use_counts[fragment] += 1
        seen[partition] = fragment
        fragment_counter[0] += 1
        return fragment

    poss = []
    count = 0
    for possibility in chart[partition]:
        count += 1
        # This is the parent graph
        partition_graph = Hgraph.from_triples(edges_in_partition, {}, warn=False)
        partition_graph.roots = partition_graph.find_roots()
        partition_graph.roots.sort(lambda x, y: node_order[x] - node_order[y])
        partition_graph.external_nodes = external_nodes
        children = []
        # print partition_graph.to_amr_string()

        spans_to_nt = {}
        old_pgraph = partition_graph
        index = 1
        for subpartition in possibility:  # These are the different sub-constituents
            edges_in_subpartition = [graph_edge_list[i]
                                     for i in range(len(subpartition.edges))
                                     if subpartition.edges[i] == 1]
            if edges_in_subpartition:
                # Some constituents do not have any edges aligned to them
                sub_graph = Hgraph.from_triples(edges_in_subpartition, {}, warn=False)
                sub_graph.roots = sub_graph.find_roots()
                sub_graph.roots.sort(lambda x, y: node_order[x] - node_order[y])
                external_node_list = partition_graph.find_external_nodes2(sub_graph)
                external_node_list.sort(lambda x, y: node_order[x] - node_order[y])
                sub_external_nodes = dict([(k, v) for v, k in enumerate(external_node_list)])
                sub_graph.external_nodes = sub_external_nodes
                sub_nt = NonterminalLabel("%s%i" % (subpartition.phrase, len(sub_external_nodes)), index)
                # Recursive call
                children.append(convert_chart(subpartition, sub_external_nodes, sub_nt))
                old_pgraph = partition_graph
                partition_graph = partition_graph.collapse_fragment2(
                    sub_graph, sub_nt, external=external_node_list, warn=False)
                spans_to_nt[subpartition.str_start] = (sub_nt, subpartition.str_end)
            else:
                sub_nt = NonterminalLabel(subpartition.phrase, index)
            # assert partition_graph.is_connected()
            index += 1

        partition_graph.roots = partition_graph.find_roots()
        partition_graph.roots.sort(lambda x, y: node_order[x] - node_order[y])

        # Assemble string rule
        str_rhs = []
        i = partition.str_start
        while i <= partition.str_end:
            if i in spans_to_nt:
                new_nt, i = spans_to_nt[i]
                str_rhs.append(new_nt)
            else:
                str_rhs.append(leaves[i])
            i = i + 1

        rule = Rule(0, nt.label, partition_graph, tuple(str_rhs), 1)
        rule_id = self.add_rule(rule)
        poss.append((rule_id, children))

    fragment = fragment_counter[0]
    result[fragment] = poss
    result.use_counts[fragment] += 1
    seen[partition] = fragment
    fragment_counter[0] += 1
    return fragment
def load_from_file(cls, in_file, rule_class=VoRule, reverse=False,
                   nodelabels=False, logprob=False):
    """
    Loads a SHRG grammar from the given file. See documentation for format
    details.

    rule_class specifies the type of rule to use. VoRule is a subclass using
    an arbitrary graph visit order (also used for strings). TdRule computes a
    tree decomposition on the first RHS when initialized.
    """
    output = Grammar(nodelabels=nodelabels, logprob=logprob)

    rule_count = 1
    line_count = 0
    is_synchronous = False

    rhs1_type = None
    rhs2_type = None

    buf = StringIO.StringIO()

    for line in in_file:
        line_count += 1
        l = line.strip()
        if l:
            if "#" in l:
                content, comment = l.split("#", 1)
            else:
                content = l
            buf.write(content.strip())
            if ";" in content:
                rulestring = buf.getvalue()
                try:
                    content, weights = rulestring.split(";", 1)
                    weight = 0.0 if not weights else (float(weights) if logprob else math.log(float(weights)))
                except:
                    raise GrammarError, \
                        "Line %i, Rule %i: Error near end of line." % (line_count, rule_count)

                try:
                    lhs, rhsstring = content.split("->")
                except:
                    raise GrammarError, \
                        "Line %i, Rule %i: Invalid rule format." % (line_count, rule_count)
                lhs = lhs.strip()
                if rule_count == 1:
                    output.start_symbol = lhs

                if "|" in rhsstring:
                    if not is_synchronous and rule_count > 1:
                        raise GrammarError, \
                            "Line %i, Rule %i: All or none of the rules need to have two RHSs." % (line_count, rule_count)
                    is_synchronous = True
                    try:
                        rhs1, rhs2 = rhsstring.split("|")
                    except:
                        raise GrammarError, "Only up to two RHSs are allowed in grammar file."
                else:
                    if is_synchronous and rule_count > 0:
                        raise ParserError, \
                            "Line %i, Rule %i: All or none of the rules need to have two RHSs." % (line_count, rule_count)
                    is_synchronous = False
                    rhs1 = rhsstring
                    rhs2 = None

                try:
                    # If the first RHS cannot be parsed as a graph, assume it's a string
                    r1 = Hgraph.from_string(rhs1)
                    r1_nts = set([(ntlabel.label, ntlabel.index)
                                  for h, ntlabel, t in r1.nonterminal_edges()])
                    if not rhs1_type:
                        rhs1_type = GRAPH_FORMAT
                except (ParserError, IndexError), e:
                    if rhs1_type == GRAPH_FORMAT:
                        raise ParserError, \
                            "Line %i, Rule %i: Could not parse graph description: %s" % (line_count, rule_count, e.message)
                    else:
                        r1 = parse_string(rhs1)
                        nts = [t for t in r1 if isinstance(t, NonterminalLabel)]
                        r1_nts = set([(ntlabel.label, ntlabel.index) for ntlabel in nts])
                        rhs1_type = STRING_FORMAT

                if is_synchronous:
                    try:
                        # If the second RHS cannot be parsed as a graph, assume it's a string
                        if rhs2_type:
                            assert rhs2_type == GRAPH_FORMAT
                        r2 = Hgraph.from_string(rhs2)
                        r2_nts = set([(ntlabel.label, ntlabel.index)
                                      for h, ntlabel, t in r2.nonterminal_edges()])
                        if not rhs2_type:
                            rhs2_type = GRAPH_FORMAT
                    except (ParserError, IndexError, AssertionError), e:
                        if rhs2_type == GRAPH_FORMAT:
                            raise ParserError, \
                                "Line %i, Rule %i: Could not parse graph description: %s" % (line_count, rule_count, e.message)
                        else:
                            r2 = parse_string(rhs2)
                            nts = [t for t in r2 if isinstance(t, NonterminalLabel)]
                            r2_nts = set([(ntlabel.label, ntlabel.index) for ntlabel in nts])
                            rhs2_type = STRING_FORMAT

                    # Verify that nonterminals match up
                    if not r1_nts == r2_nts:
                        raise GrammarError, \
                            "Line %i, Rule %i: Nonterminals do not match between RHSs: %s %s" % (line_count, rule_count, str(r1_nts), str(r2_nts))
                else:
                    r2 = None

                try:
                    if is_synchronous and reverse:
                        output[rule_count] = rule_class(rule_count, lhs, weight, r2, r1,
                                                        nodelabels=nodelabels, logprob=logprob)
                    else:
                        output[rule_count] = rule_class(rule_count, lhs, weight, r1, r2,
                                                        nodelabels=nodelabels, logprob=logprob)
                except Exception, e:
                    raise GrammarError, \
                        "Line %i, Rule %i: Could not initialize rule. %s" % (line_count, rule_count, e.message)

                buf = StringIO.StringIO()
                rule_count += 1
log.info("Loaded %s%s grammar with %i rules."\ % (grammar.rhs1_type, "-to-%s" % grammar.rhs2_type if grammar.rhs2_type else '', len(grammar))) # EM training if config.train: iterations = config.train if not config.input_file: log.err("Please specify corpus file for EM training.") sys.exit(1) if config.bitext: corpus = list(read_pairs(fileinput.input(config.input_file))) grammar.em(corpus, iterations, parser_class, "synchronous") else: corpus = [Hgraph.from_string(x) for x in fileinput.input(config.input_file)] grammar.em(corpus, iterations, parser_class, "forward") for rid in sorted(grammar.keys()): output_file.write(str(grammar[rid])) output_file.write("\n") sys.exit(0) # Normalization if config.normalize: if config.bitext or grammar.rhs2_type is None or config.g or (config.k and not config.input_files): grammar.normalize_lhs() else: grammar.normalize_rhs2() for rid in sorted(grammar.keys()): output_file.write(str(grammar[rid])) output_file.write("\n")
log.info("Loaded %s%s grammar with %i rules."\ % (grammar.rhs1_type, "-to-%s" % grammar.rhs2_type if grammar.rhs2_type else '', len(grammar))) # EM training if config.train: iterations = config.train if not config.input_file: log.err("Please specify corpus file for EM training.") sys.exit(1) if config.bitext: corpus = list(read_pairs(fileinput.input(config.input_file))) grammar.em(corpus, iterations, parser_class, "synchronous") else: corpus = [ Hgraph.from_string(x) for x in fileinput.input(config.input_file) ] grammar.em(corpus, iterations, parser_class, "forward") for rid in sorted(grammar.keys()): output_file.write(str(grammar[rid])) output_file.write("\n") sys.exit(0) # Normalization if config.normalize: if config.bitext or grammar.rhs2_type is None or config.g or ( config.k and not config.input_files): grammar.normalize_lhs() else: grammar.normalize_rhs2()