def parse_bitexts(self, pair_iterator):
    """
    Generator over the charts of all input pairs delivered by pair_iterator.

    Each side of a pair is turned into an Hgraph when the grammar's matching
    RHS type is "hypergraph", and into a list of whitespace-separated tokens
    otherwise. One chart is yielded per pair.
    """
    def read_side(text, rhs_type):
        # Graph sides are parsed; every other side is treated as a token string.
        if rhs_type == "hypergraph":
            return Hgraph.from_string(text)
        return text.strip().split()

    for first, second in pair_iterator:
        left = read_side(first, self.grammar.rhs1_type)
        right = read_side(second, self.grammar.rhs2_type)
        yield cky_chart(self.parse_bitext(left, right))
def test():
    """Build the example parse tree and its aligned instance-edge graph."""
    # Bracketed tree whose leaves read "The boy wants the girl to believe him ."
    tree = FancyTree(""" (S (NP (DT The) (NN boy)) (VP (VBZ wants) (NP (DT the) (NN girl) (S (VP (TO to) (VP (VB believe) (NP (PRP him))))))) (. .))""")

    graph_description = "(w.want :arg0 b.boy :arg1 (b2.believe :arg0 (g.girl) :arg1 b.))"
    graph = Hgraph.from_string(graph_description)

    # The indices coincide with the 0-based leaf positions of boy, wants,
    # girl and believe in the tree above.
    graph.node_alignments = {
        "b": [1],
        "w": [2],
        "g": [4],
        "b2": [6],
    }
    graph = graph.to_instance_edges()
def test():
    """Set up the sample sentence tree together with its aligned graph."""
    # Leaves of this bracketing: The boy wants the girl to believe him .
    tree_description = """ (S (NP (DT The) (NN boy)) (VP (VBZ wants) (NP (DT the) (NN girl) (S (VP (TO to) (VP (VB believe) (NP (PRP him))))))) (. .))"""
    tree = FancyTree(tree_description)

    graph = Hgraph.from_string(
        "(w.want :arg0 b.boy :arg1 (b2.believe :arg0 (g.girl) :arg1 b.))"
    )
    # Each graph node is paired with a list holding one index; the values match
    # the 0-based positions of boy/wants/girl/believe among the tree leaves.
    alignments = {"b": [1], "w": [2], "g": [4], "b2": [6]}
    graph.node_alignments = alignments
    graph = graph.to_instance_edges()
# Report what was loaded; the "-to-<type>" part only appears when the grammar
# has a second RHS type.
if grammar.rhs2_type:
    rhs2_suffix = "-to-%s" % grammar.rhs2_type
else:
    rhs2_suffix = ''
log.info("Loaded %s%s grammar with %i rules." % (grammar.rhs1_type, rhs2_suffix, len(grammar)))

# EM training
if config.train:
    # config.train doubles as the switch and as the iteration count.
    iterations = config.train
    if not config.input_file:
        log.err("Please specify corpus file for EM training.")
        sys.exit(1)

    corpus_lines = fileinput.input(config.input_file)
    if config.bitext:
        corpus = list(read_pairs(corpus_lines))
        em_mode = "synchronous"
    else:
        corpus = [Hgraph.from_string(x) for x in corpus_lines]
        em_mode = "forward"
    grammar.em(corpus, iterations, parser_class, em_mode)

    # Dump the re-estimated rules in rule-id order, one per line, then stop.
    for rid in sorted(grammar.keys()):
        output_file.write(str(grammar[rid]) + "\n")
    sys.exit(0)

# Normalization
if config.normalize:
    # NOTE(review): this reads config.input_files while the EM branch above
    # reads config.input_file -- confirm which attribute the config provides.
    normalize_by_lhs = (config.bitext
                        or grammar.rhs2_type is None
                        or config.g
                        or (config.k and not config.input_files))
    if normalize_by_lhs:
        grammar.normalize_lhs()
    else:
        grammar.normalize_rhs2()
def tree_decomposition_edge(graph_edge, visited, amr, nodelabels=False):
    """
    Build the tree decomposition node for one graph edge.

    The edge is recorded in visited. The returned TreeNode carries the edge
    itself, the edge's head and tail nodes in graph_nodes, and as first_child
    whatever tree_decomposition_node produces for the tail nodes.

    With nodelabels set, the head and every tail entry are indexed/unpacked as
    pairs and only their first components (the nodes) are used.
    """
    visited.add(graph_edge)
    tree_node = TreeNode()

    if nodelabels:
        head = graph_edge[0][0]
        nodes = ()
        if graph_edge[2]:
            # Split the tail pairs into nodes and labels; the labels are not needed.
            nodes, _labels = zip(*graph_edge[2])
    else:
        head, nodes = graph_edge[0], graph_edge[2]

    tree_node.graph_nodes.add(head)
    tree_node.graph_nodes |= set(nodes)
    tree_node.graph_edge = graph_edge

    child = tree_decomposition_node(nodes, visited, amr, nodelabels=nodelabels)
    tree_node.first_child = child
    return tree_node


if __name__ == "__main__":
    # Script entry: decompose a small example graph.
    from common.hgraph.hgraph import Hgraph

    graph = Hgraph.from_string("(n :P$1 :arg0 (a.n :E$2) :arg1 (n :S$3 a.))")
    td = tree_decomposition(graph)
# NOTE(review): the two statements below are the tail of a function whose header
# lies above the visible source; their original nesting cannot be recovered here.
subtrees.append(tree_node)
return subtrees[0]


def tree_decomposition_edge(graph_edge, visited, amr, nodelabels = False):
    """
    Build the tree decomposition node for one graph edge.

    Adds graph_edge to visited and returns a new TreeNode whose graph_nodes
    contain the edge's head and tail nodes, whose graph_edge is the edge, and
    whose first_child is the result of tree_decomposition_node on the tail
    nodes. amr is only passed through to tree_decomposition_node.
    """
    visited.add(graph_edge)
    tree_node = TreeNode()
    if nodelabels:
        # With node labels, the head is indexed as a pair and the tail entries
        # are unpacked into (nodes, labels); only the nodes are used below.
        head = graph_edge[0][0]
        if graph_edge[2]:
            nodes, labels = zip(*graph_edge[2])
        else:
            nodes = ()
    else:
        head = graph_edge[0]
        nodes = graph_edge[2]
    tree_node.graph_nodes.add(head)
    tree_node.graph_nodes |= set(nodes)
    tree_node.graph_edge = graph_edge
    tree_node.first_child = tree_decomposition_node(nodes, visited, amr, nodelabels = nodelabels)
    return tree_node


if __name__ == "__main__":
    # Script entry: run the decomposition on a small example graph.
    from common.hgraph.hgraph import Hgraph
    graph = Hgraph.from_string("(n :P$1 :arg0 (a.n :E$2) :arg1 (n :S$3 a.))")
    td = tree_decomposition(graph)
# Announce the grammar that was loaded; "-to-<type>" is appended only when a
# second RHS type is set.
target_part = ''
if grammar.rhs2_type:
    target_part = "-to-%s" % grammar.rhs2_type
log.info("Loaded %s%s grammar with %i rules." % (grammar.rhs1_type, target_part, len(grammar)))

# EM training
if config.train:
    # The value of config.train is used as the number of EM iterations.
    iterations = config.train
    if not config.input_file:
        log.err("Please specify corpus file for EM training.")
        sys.exit(1)

    training_lines = fileinput.input(config.input_file)
    if config.bitext:
        corpus = list(read_pairs(training_lines))
        grammar.em(corpus, iterations, parser_class, "synchronous")
    else:
        corpus = [Hgraph.from_string(x) for x in training_lines]
        grammar.em(corpus, iterations, parser_class, "forward")

    # Write the trained rules ordered by rule id, one per line, and exit.
    for rid in sorted(grammar.keys()):
        output_file.write(str(grammar[rid]) + "\n")
    sys.exit(0)

# Normalization
if config.normalize:
    # NOTE(review): config.input_files is read here, whereas the EM branch
    # above uses config.input_file -- confirm the attribute name.
    use_lhs = (config.bitext
               or grammar.rhs2_type is None
               or config.g
               or (config.k and not config.input_files))
    if use_lhs:
        grammar.normalize_lhs()
    else:
        grammar.normalize_rhs2()
    # Emit the normalized rules ordered by rule id, one per line.
    for rid in sorted(grammar.keys()):
        output_file.write(str(grammar[rid]) + "\n")
def main():
    """
    Render every distinct graph listed in a file to an image file.

    Command-line arguments:
      sys.argv[1] -- input file; the text before the first tab of each
                     (stripped) line is taken as a graph description.
      sys.argv[2] -- output prefix; images are written to
                     "<prefix>_<i>.jpg", where i enumerates the distinct
                     descriptions.

    Duplicate descriptions are rendered once. The descriptions are held in a
    set, so the index a given graph receives follows set iteration order.
    """
    # Read through a context manager so the input handle is closed as soon as
    # the descriptions are collected (the former file(...) call never closed it).
    with open(sys.argv[1]) as graph_file:
        graphs = set(line.strip().split('\t')[0] for line in graph_file)

    for i, graph in enumerate(graphs):
        g = Hgraph.from_string(graph)
        g.render_to_file("{0}_{1}.jpg".format(sys.argv[2], i))
def load_from_file(cls, in_file, rule_class=VoRule, reverse=False, nodelabels=False, logprob=False):
    """
    Loads a SHRG grammar from the given file.
    See documentation for format details.

    A rule may span several lines: line contents are concatenated until a
    line containing ";" is seen, and the text after the ";" is the rule
    weight. Everything after "#" on a line is ignored. A rule has the form
    "LHS -> RHS1" or "LHS -> RHS1 | RHS2"; either all rules or none must
    have two RHSs. The LHS of the first rule becomes the start symbol.

    rule_class specifies the type of rule to use. VoRule is a subclass using
    an arbitrary graph visit order (also used for strings). TdRule computes
    a tree decomposition on the first RHS when initialized.

    Parameters:
      in_file    -- iterable yielding the lines of the grammar description.
      rule_class -- called as rule_class(rule_id, lhs, weight, rhs1, rhs2,
                    nodelabels=..., logprob=...) for every rule.
      reverse    -- if true and the grammar is synchronous, the two RHSs are
                    handed to rule_class in swapped order.
      nodelabels -- passed on to Grammar and to rule_class.
      logprob    -- if true, weights are used as written; otherwise math.log
                    is applied to them. A missing weight becomes 0.0.

    Raises GrammarError for malformed rules, mismatching nonterminals or a
    failing rule_class call, and ParserError when an RHS cannot be parsed as
    a graph in a graph grammar or a one-RHS rule follows two-RHS rules.
    """
    output = Grammar(nodelabels=nodelabels, logprob=logprob)

    rule_count = 1
    line_count = 0
    is_synchronous = False

    rhs1_type = None
    rhs2_type = None

    buf = StringIO.StringIO()

    for line in in_file:
        line_count += 1
        stripped = line.strip()
        if stripped:
            # Keep only the part before a comment marker, if there is one.
            content = stripped.split("#", 1)[0]
            buf.write(content.strip())
            if ";" in content:
                # ";" terminates the rule: everything buffered so far belongs to it.
                rulestring = buf.getvalue()
                try:
                    content, weights = rulestring.split(";", 1)
                    if not weights:
                        weight = 0.0
                    elif logprob:
                        weight = float(weights)
                    else:
                        weight = math.log(float(weights))
                except ValueError:
                    # float() and math.log() signal bad input with ValueError.
                    raise GrammarError(
                        "Line %i, Rule %i: Error near end of line." % (line_count, rule_count))

                try:
                    lhs, rhsstring = content.split("->")
                except ValueError:
                    raise GrammarError(
                        "Line %i, Rule %i: Invalid rule format." % (line_count, rule_count))
                lhs = lhs.strip()
                if rule_count == 1:
                    output.start_symbol = lhs

                if "|" in rhsstring:
                    if not is_synchronous and rule_count > 1:
                        raise GrammarError(
                            "Line %i, Rule %i: All or none of the rules need to have two RHSs."
                            % (line_count, rule_count))
                    is_synchronous = True
                    try:
                        rhs1, rhs2 = rhsstring.split("|")
                    except ValueError:
                        raise GrammarError("Only up to two RHSs are allowed in grammar file.")
                else:
                    # is_synchronous can only be set once an earlier rule had two RHSs.
                    # NOTE(review): this raises ParserError while the mirror case above
                    # raises GrammarError; kept as is because callers may rely on it.
                    if is_synchronous:
                        raise ParserError(
                            "Line %i, Rule %i: All or none of the rules need to have two RHSs."
                            % (line_count, rule_count))
                    is_synchronous = False
                    rhs1 = rhsstring
                    rhs2 = None

                # NOTE(review): unlike RHS 2 below, RHS 1 is always tried as a graph
                # first, even after rhs1_type has been fixed to STRING_FORMAT.
                try:
                    # If the first graph in the file cannot be parsed, assume it's a string
                    r1 = Hgraph.from_string(rhs1)
                    r1_nts = set((ntlabel.label, ntlabel.index)
                                 for h, ntlabel, t in r1.nonterminal_edges())
                    if not rhs1_type:
                        rhs1_type = GRAPH_FORMAT
                except (ParserError, IndexError) as e:
                    if rhs1_type == GRAPH_FORMAT:
                        raise ParserError(
                            "Line %i, Rule %i: Could not parse graph description: %s"
                            % (line_count, rule_count, e.message))
                    r1 = parse_string(rhs1)
                    nts = [t for t in r1 if isinstance(t, NonterminalLabel)]
                    r1_nts = set((ntlabel.label, ntlabel.index) for ntlabel in nts)
                    rhs1_type = STRING_FORMAT

                if is_synchronous:
                    # Try RHS 2 as a graph unless it is already known to be a string.
                    # This is an explicit test rather than an `assert`, so it still
                    # works when assertions are stripped (python -O).
                    r2_is_graph = False
                    if not rhs2_type or rhs2_type == GRAPH_FORMAT:
                        try:
                            # If the first graph in the file cannot be parsed, assume it's a string
                            r2 = Hgraph.from_string(rhs2)
                            r2_nts = set((ntlabel.label, ntlabel.index)
                                         for h, ntlabel, t in r2.nonterminal_edges())
                            if not rhs2_type:
                                rhs2_type = GRAPH_FORMAT
                            r2_is_graph = True
                        except (ParserError, IndexError, AssertionError) as e:
                            if rhs2_type == GRAPH_FORMAT:
                                raise ParserError(
                                    "Line %i, Rule %i: Could not parse graph description: %s"
                                    % (line_count, rule_count, e.message))
                    if not r2_is_graph:
                        r2 = parse_string(rhs2)
                        nts = [t for t in r2 if isinstance(t, NonterminalLabel)]
                        r2_nts = set((ntlabel.label, ntlabel.index) for ntlabel in nts)
                        rhs2_type = STRING_FORMAT

                    # Verify that nonterminals match up
                    if r1_nts != r2_nts:
                        raise GrammarError(
                            "Line %i, Rule %i: Nonterminals do not match between RHSs: %s %s"
                            % (line_count, rule_count, str(r1_nts), str(r2_nts)))
                else:
                    r2 = None

                try:
                    if is_synchronous and reverse:
                        output[rule_count] = rule_class(rule_count, lhs, weight, r2, r1,
                                                        nodelabels=nodelabels, logprob=logprob)
                    else:
                        output[rule_count] = rule_class(rule_count, lhs, weight, r1, r2,
                                                        nodelabels=nodelabels, logprob=logprob)
                except Exception as e:
                    raise GrammarError(
                        "Line %i, Rule %i: Could not initialize rule. %s"
                        % (line_count, rule_count, e.message))

                # Start collecting the next rule.
                buf = StringIO.StringIO()
                rule_count += 1
def load_from_file(cls, in_file, rule_class=VoRule, reverse=False, nodelabels=False, logprob=False):
    """
    Loads a SHRG grammar from the given file.
    See documentation for format details.

    Rules are read line by line; the contents of consecutive lines are
    joined until a line containing ";" ends the rule, and the text after
    the ";" is the rule weight. Anything after "#" on a line is dropped.
    Rules look like "LHS -> RHS1" or "LHS -> RHS1 | RHS2", and all rules or
    none must have two RHSs. The first rule's LHS becomes the start symbol.

    rule_class specifies the type of rule to use. VoRule is a subclass using
    an arbitrary graph visit order (also used for strings). TdRule computes
    a tree decomposition on the first RHS when initialized.

    Parameters:
      in_file    -- iterable yielding the lines of the grammar description.
      rule_class -- called as rule_class(rule_id, lhs, weight, rhs1, rhs2,
                    nodelabels=..., logprob=...) for every rule.
      reverse    -- if true and the grammar is synchronous, rule_class
                    receives the two RHSs in swapped order.
      nodelabels -- passed on to Grammar and to rule_class.
      logprob    -- if true, weights are taken as written; otherwise
                    math.log is applied. An absent weight becomes 0.0.

    Raises GrammarError for malformed rules, mismatching nonterminals or a
    failing rule_class call, and ParserError when an RHS cannot be parsed as
    a graph in a graph grammar or a one-RHS rule follows two-RHS rules.
    """
    output = Grammar(nodelabels=nodelabels, logprob=logprob)

    rule_count = 1
    line_count = 0
    is_synchronous = False

    rhs1_type = None
    rhs2_type = None

    buf = StringIO.StringIO()

    for line in in_file:
        line_count += 1
        stripped = line.strip()
        if stripped:
            # Discard a trailing comment; lines without "#" are kept whole.
            content = stripped.split("#", 1)[0]
            buf.write(content.strip())
            if ";" in content:
                # The rule is complete once a line carries the ";" terminator.
                rulestring = buf.getvalue()
                try:
                    content, weights = rulestring.split(";", 1)
                    if not weights:
                        weight = 0.0
                    elif logprob:
                        weight = float(weights)
                    else:
                        weight = math.log(float(weights))
                except ValueError:
                    # Both float() and math.log() report bad input as ValueError.
                    raise GrammarError(
                        "Line %i, Rule %i: Error near end of line." % (line_count, rule_count))

                try:
                    lhs, rhsstring = content.split("->")
                except ValueError:
                    raise GrammarError(
                        "Line %i, Rule %i: Invalid rule format." % (line_count, rule_count))
                lhs = lhs.strip()
                if rule_count == 1:
                    output.start_symbol = lhs

                if "|" in rhsstring:
                    if not is_synchronous and rule_count > 1:
                        raise GrammarError(
                            "Line %i, Rule %i: All or none of the rules need to have two RHSs."
                            % (line_count, rule_count))
                    is_synchronous = True
                    try:
                        rhs1, rhs2 = rhsstring.split("|")
                    except ValueError:
                        raise GrammarError("Only up to two RHSs are allowed in grammar file.")
                else:
                    # is_synchronous is only ever set by an earlier two-RHS rule.
                    # NOTE(review): ParserError here vs. GrammarError in the mirror
                    # case above; left unchanged since callers may catch it.
                    if is_synchronous:
                        raise ParserError(
                            "Line %i, Rule %i: All or none of the rules need to have two RHSs."
                            % (line_count, rule_count))
                    is_synchronous = False
                    rhs1 = rhsstring
                    rhs2 = None

                # NOTE(review): RHS 1 is always attempted as a graph first, even when
                # rhs1_type is already STRING_FORMAT; RHS 2 below is not.
                try:
                    # If the first graph in the file cannot be parsed, assume it's a string
                    r1 = Hgraph.from_string(rhs1)
                    r1_nts = set((ntlabel.label, ntlabel.index)
                                 for h, ntlabel, t in r1.nonterminal_edges())
                    if not rhs1_type:
                        rhs1_type = GRAPH_FORMAT
                except (ParserError, IndexError) as e:
                    if rhs1_type == GRAPH_FORMAT:
                        raise ParserError(
                            "Line %i, Rule %i: Could not parse graph description: %s"
                            % (line_count, rule_count, e.message))
                    r1 = parse_string(rhs1)
                    nts = [t for t in r1 if isinstance(t, NonterminalLabel)]
                    r1_nts = set((ntlabel.label, ntlabel.index) for ntlabel in nts)
                    rhs1_type = STRING_FORMAT

                if is_synchronous:
                    # Graph parsing of RHS 2 is skipped once it is known to be a
                    # string. The test is explicit instead of an `assert` so that it
                    # keeps working with assertions disabled (python -O).
                    r2_is_graph = False
                    if not rhs2_type or rhs2_type == GRAPH_FORMAT:
                        try:
                            # If the first graph in the file cannot be parsed, assume it's a string
                            r2 = Hgraph.from_string(rhs2)
                            r2_nts = set((ntlabel.label, ntlabel.index)
                                         for h, ntlabel, t in r2.nonterminal_edges())
                            if not rhs2_type:
                                rhs2_type = GRAPH_FORMAT
                            r2_is_graph = True
                        except (ParserError, IndexError, AssertionError) as e:
                            if rhs2_type == GRAPH_FORMAT:
                                raise ParserError(
                                    "Line %i, Rule %i: Could not parse graph description: %s"
                                    % (line_count, rule_count, e.message))
                    if not r2_is_graph:
                        r2 = parse_string(rhs2)
                        nts = [t for t in r2 if isinstance(t, NonterminalLabel)]
                        r2_nts = set((ntlabel.label, ntlabel.index) for ntlabel in nts)
                        rhs2_type = STRING_FORMAT

                    # Verify that nonterminals match up
                    if r1_nts != r2_nts:
                        raise GrammarError(
                            "Line %i, Rule %i: Nonterminals do not match between RHSs: %s %s"
                            % (line_count, rule_count, str(r1_nts), str(r2_nts)))
                else:
                    r2 = None

                try:
                    if is_synchronous and reverse:
                        output[rule_count] = rule_class(
                            rule_count, lhs, weight, r2, r1,
                            nodelabels=nodelabels, logprob=logprob)
                    else:
                        output[rule_count] = rule_class(
                            rule_count, lhs, weight, r1, r2,
                            nodelabels=nodelabels, logprob=logprob)
                except Exception as e:
                    raise GrammarError(
                        "Line %i, Rule %i: Could not initialize rule. %s"
                        % (line_count, rule_count, e.message))

                # Reset the buffer for the next rule.
                buf = StringIO.StringIO()
                rule_count += 1