예제 #1
0
파일: parser.py 프로젝트: rupenp/bolinas
    def parse_bitexts(self, pair_iterator):
        """
        Lazily parse every pair of inputs produced by ``pair_iterator``.

        For each side of a pair: when the grammar's RHS type for that side is
        "hypergraph" the raw input is read with ``Hgraph.from_string``;
        otherwise it is stripped and split on whitespace into a token list.
        The two objects are passed to ``self.parse_bitext`` and the resulting
        raw chart is wrapped with ``cky_chart`` before being yielded.

        This is a generator.
        """
        for first_raw, second_raw in pair_iterator:
            first = (Hgraph.from_string(first_raw)
                     if self.grammar.rhs1_type == "hypergraph"
                     else first_raw.strip().split())
            second = (Hgraph.from_string(second_raw)
                      if self.grammar.rhs2_type == "hypergraph"
                      else second_raw.strip().split())
            yield cky_chart(self.parse_bitext(first, second))
예제 #2
0
    def parse_bitexts(self, pair_iterator):
        """
        Generator that parses each input pair from ``pair_iterator`` in turn.

        Each element of a pair is converted according to the grammar's RHS
        type for that side ("hypergraph" inputs go through
        ``Hgraph.from_string``, everything else becomes a whitespace-split
        token list), the pair is parsed with ``self.parse_bitext``, and the
        raw chart is yielded wrapped in ``cky_chart``.
        """
        def read_side(raw, rhs_type):
            # Hypergraph sides are parsed; any other side is tokenized.
            if rhs_type == "hypergraph":
                return Hgraph.from_string(raw)
            return raw.strip().split()

        for raw_left, raw_right in pair_iterator:
            left = read_side(raw_left, self.grammar.rhs1_type)
            right = read_side(raw_right, self.grammar.rhs2_type)
            chart = self.parse_bitext(left, right)
            yield cky_chart(chart)
예제 #3
0
def test():
    """Build a sample bracketed tree and a hypergraph with node alignments.

    Constructs a ``FancyTree`` from a bracketed parse, reads a hypergraph from
    its string description, attaches a node-id -> integer-list alignment table
    to it, and converts the graph with ``to_instance_edges()``.

    NOTE(review): nothing is returned or asserted in the visible code; the
    alignment integers presumably index tokens of the sentence -- confirm.
    """
    tree_description = """
    (S
        (NP (DT The) (NN boy))
        (VP (VBZ wants)
          (NP (DT the) (NN girl)
            (S
              (VP (TO to)
                (VP (VB believe)
                  (NP (PRP him)))))))
        (. .))"""
    tree = FancyTree(tree_description)

    graph_description = (
        "(w.want :arg0 b.boy :arg1 (b2.believe :arg0 (g.girl) :arg1 b.))")
    graph = Hgraph.from_string(graph_description)
    graph.node_alignments = {"b": [1], "w": [2], "g": [4], "b2": [6]}
    graph = graph.to_instance_edges()
예제 #4
0
def test():
    """Set up a sample tree/graph pair for manual experimentation.

    A ``FancyTree`` is built from a bracketed parse and an ``Hgraph`` from its
    string form; the graph receives a table mapping node ids to integer lists
    and is then converted via ``to_instance_edges()``.

    NOTE(review): the visible code neither returns nor checks anything.
    """
    bracketed = """
    (S
        (NP (DT The) (NN boy))
        (VP (VBZ wants)
          (NP (DT the) (NN girl)
            (S
              (VP (TO to)
                (VP (VB believe)
                  (NP (PRP him)))))))
        (. .))"""
    tree = FancyTree(bracketed)

    amr_text = "(w.want :arg0 b.boy :arg1 (b2.believe :arg0 (g.girl) :arg1 b.))"
    graph = Hgraph.from_string(amr_text)
    graph.node_alignments = {"b": [1], "w": [2], "g": [4], "b2": [6]}
    graph = graph.to_instance_edges()
예제 #5
0
        log.info("Loaded %s%s grammar with %i rules."\
            % (grammar.rhs1_type, "-to-%s" % grammar.rhs2_type if grammar.rhs2_type else '', len(grammar)))

        # EM training
        if config.train:
            iterations = config.train
            if not config.input_file:
                log.err("Please specify corpus file for EM training.")
                sys.exit(1)
            if config.bitext:
                corpus = list(read_pairs(fileinput.input(config.input_file)))
                grammar.em(corpus, iterations, parser_class, "synchronous")
            else:
                corpus = [
                    Hgraph.from_string(x)
                    for x in fileinput.input(config.input_file)
                ]
                grammar.em(corpus, iterations, parser_class, "forward")
            for rid in sorted(grammar.keys()):
                output_file.write(str(grammar[rid]))
                output_file.write("\n")
            sys.exit(0)

        # Normalization
        if config.normalize:
            if config.bitext or grammar.rhs2_type is None or config.g or (
                    config.k and not config.input_files):
                grammar.normalize_lhs()
            else:
                grammar.normalize_rhs2()

def tree_decomposition_edge(graph_edge, visited, amr, nodelabels=False):
    """Create the tree-decomposition node for one graph edge.

    The edge is recorded in ``visited``. A new ``TreeNode`` is created whose
    ``graph_nodes`` receive the edge's head and tail nodes, whose
    ``graph_edge`` is the edge itself, and whose ``first_child`` is the result
    of ``tree_decomposition_node`` applied to the tail nodes.

    graph_edge is indexed as ``[0]`` (head) and ``[2]`` (tail nodes). With
    ``nodelabels`` set, the head is ``graph_edge[0][0]`` and each tail entry
    is unpacked as a two-element item (presumably ``(node, label)`` --
    confirm against the graph class); only the first components are used.

    Returns the new ``TreeNode``.
    """
    visited.add(graph_edge)
    result = TreeNode()

    if not nodelabels:
        head = graph_edge[0]
        nodes = graph_edge[2]
    else:
        head = graph_edge[0][0]
        labelled_tail = graph_edge[2]
        if labelled_tail:
            # Separate the first components from the second ones.
            nodes, _labels = zip(*labelled_tail)
        else:
            nodes = ()

    result.graph_nodes.add(head)
    result.graph_nodes |= set(nodes)
    result.graph_edge = graph_edge
    result.first_child = tree_decomposition_node(
        nodes, visited, amr, nodelabels=nodelabels)
    return result


if __name__ == "__main__":
    # Ad-hoc run: compute the tree decomposition of one small hard-coded
    # hypergraph when the module is executed as a script.
    from common.hgraph.hgraph import Hgraph

    graph = Hgraph.from_string(
        "(n :P$1 :arg0 (a.n :E$2) :arg1 (n :S$3 a.))"
    )
    td = tree_decomposition(graph)
예제 #7
0
    subtrees.append(tree_node)

  return subtrees[0]

def tree_decomposition_edge(graph_edge, visited, amr, nodelabels = False):
    """Build the tree-decomposition node that corresponds to ``graph_edge``.

    Marks the edge as visited, then fills a fresh ``TreeNode``: its
    ``graph_nodes`` set gets the edge's head plus its tail nodes, its
    ``graph_edge`` is the edge, and its ``first_child`` comes from
    ``tree_decomposition_node`` called on the tail nodes.

    When ``nodelabels`` is true the head is taken from ``graph_edge[0][0]``
    and the tail entries in ``graph_edge[2]`` are unpacked as two-element
    items (presumably node/label pairs -- confirm); otherwise
    ``graph_edge[0]`` and ``graph_edge[2]`` are used directly.

    Returns the populated ``TreeNode``.
    """
    visited.add(graph_edge)
    node = TreeNode()

    if not nodelabels:
        head = graph_edge[0]
        nodes = graph_edge[2]
    else:
        head = graph_edge[0][0]
        tail_items = graph_edge[2]
        if tail_items:
            # Only the first component of each tail item is needed below.
            nodes, _unused_labels = zip(*tail_items)
        else:
            nodes = ()

    node.graph_nodes.add(head)
    node.graph_nodes |= set(nodes)
    node.graph_edge = graph_edge
    node.first_child = tree_decomposition_node(
        nodes, visited, amr, nodelabels = nodelabels)
    return node


if __name__ == "__main__":
    # Script entry point: decompose a single hard-coded example hypergraph.
    from common.hgraph.hgraph import Hgraph

    graph = Hgraph.from_string(
        "(n :P$1 :arg0 (a.n :E$2) :arg1 (n :S$3 a.))")
    td = tree_decomposition(graph)
예제 #8
0
파일: bolinas.py 프로젝트: jimwhite/bolinas
        log.info("Loaded %s%s grammar with %i rules."\
            % (grammar.rhs1_type, "-to-%s" % grammar.rhs2_type if grammar.rhs2_type else '', len(grammar)))
 

        # EM training 
        if config.train:
            iterations = config.train
            if not config.input_file: 
                log.err("Please specify corpus file for EM training.")
                sys.exit(1)
            if config.bitext:
                corpus = list(read_pairs(fileinput.input(config.input_file)))
                grammar.em(corpus, iterations, parser_class, "synchronous")
            else: 
                corpus = [Hgraph.from_string(x) for x in fileinput.input(config.input_file)]
                grammar.em(corpus, iterations, parser_class, "forward")
            for rid in sorted(grammar.keys()): 
                output_file.write(str(grammar[rid]))
                output_file.write("\n")
            sys.exit(0)

        # Normalization
        if config.normalize:
            if config.bitext or grammar.rhs2_type is None or config.g or (config.k and not config.input_files):
                grammar.normalize_lhs()
            else:
                grammar.normalize_rhs2()
            for rid in sorted(grammar.keys()): 
                output_file.write(str(grammar[rid]))
                output_file.write("\n")
예제 #9
0
def main():
    """Render every distinct graph listed in a tab-separated file.

    Reads the file named by ``sys.argv[1]``, takes the first tab-separated
    field of each stripped line as a graph description, removes duplicates,
    and renders each graph to ``<sys.argv[2]>_<i>.jpg`` where ``i`` is the
    graph's position in the de-duplicated collection.

    Raises IndexError when fewer than two command-line arguments are given.
    """
    # `open` under `with` replaces the Python-2-only `file()` builtin and
    # guarantees the handle is closed once the descriptions are collected.
    with open(sys.argv[1]) as graph_file:
        graphs = set(line.strip().split('\t')[0] for line in graph_file)

    for i, graph in enumerate(graphs):
        g = Hgraph.from_string(graph)
        g.render_to_file("{0}_{1}.jpg".format(sys.argv[2], i))
예제 #10
0
파일: grammar.py 프로젝트: ChenluJi/bolinas
    def load_from_file(cls, in_file, rule_class = VoRule, reverse = False, nodelabels = False, logprob = False):
        """
        Loads a SHRG grammar from the given file. 
        See documentation for format details.
        
        rule_class specifies the type of rule to use. VoRule is a subclass using an arbitrary graph
        visit order (also used for strings). TdRule computes a tree decomposition on the first RHS
        when initialized.

        Parameters:
          in_file    -- iterable of text lines. Text after a "#" on a line is discarded,
                        a rule may span several lines and ends on a line containing ";".
          rule_class -- callable used to build each rule object.
          reverse    -- for synchronous rules, pass the two RHSs to rule_class in swapped order.
          nodelabels -- forwarded to Grammar and to rule_class.
          logprob    -- if true, the weight after ";" is used as-is; otherwise its
                        natural logarithm is stored. A missing weight is stored as 0.0.

        Raises GrammarError for malformed rules and ParserError for an RHS that cannot be
        parsed as a graph (and, in one branch, for a missing second RHS).

        NOTE(review): the visible code builds `output` but never returns it; the end of
        this method is probably outside this excerpt.
        """

        # The grammar being filled in; rules are stored under their rule number.
        output = Grammar(nodelabels = nodelabels, logprob = logprob)

        rule_count = 1           # 1-based number of the rule currently being read
        line_count = 0           # input lines consumed so far (used in error messages)
        is_synchronous = False   # True once a rule with two RHSs ("|") has been seen

        # Representation of each RHS (GRAPH_FORMAT or STRING_FORMAT); None until decided.
        rhs1_type = None
        rhs2_type = None

        # Accumulates the text of one rule across input lines.
        buf = StringIO.StringIO() 

        for line in in_file: 
            line_count += 1
            l = line.strip()
            if l:
                # Keep only the part of the line before the first "#".
                if "#" in l: 
                    content, comment = l.split("#",1)
                else: 
                    content = l
                buf.write(content.strip())
                # A ";" on the current line terminates the rule collected in buf.
                if ";" in content:
                    rulestring = buf.getvalue()
                    try:
                        # Everything after the first ";" is the weight. Empty -> 0.0;
                        # otherwise taken as-is (logprob) or log-transformed.
                        content, weights = rulestring.split(";",1)            
                        weight = 0.0 if not weights else (float(weights) if logprob else math.log(float(weights)))
                    except:
                        raise GrammarError, \
            "Line %i, Rule %i: Error near end of line." % (line_count, rule_count)
                   
                    try:  
                        # Exactly one "->" is required, otherwise the unpacking fails.
                        lhs, rhsstring = content.split("->")
                    except:
                        raise GrammarError, \
            "Line %i, Rule %i: Invalid rule format." % (line_count, rule_count)
                    lhs = lhs.strip()
                    # The LHS of the first rule becomes the grammar's start symbol.
                    if rule_count == 1:
                        output.start_symbol = lhs
                    # "|" separates two RHSs (synchronous rule). Every rule after the
                    # first must agree with the earlier rules on having one or two RHSs.
                    if "|" in rhsstring:
                        if not is_synchronous and rule_count > 1:
                            raise GrammarError,\
           "Line %i, Rule %i: All or none of the rules need to have two RHSs." % (line_count, rule_count)
                        is_synchronous = True
                        try:
                            # More than one "|" makes this unpacking fail.
                            rhs1,rhs2 = rhsstring.split("|")
                        except:
                            raise GrammarError,"Only up to two RHSs are allowed in grammar file."
                    else: 
                        # NOTE(review): rule_count starts at 1, so `rule_count > 0` is always
                        # true; this branch raises ParserError whereas the symmetric check
                        # above raises GrammarError.
                        if is_synchronous and rule_count > 0:
                            raise ParserError,\
            "Line %i, Rule %i: All or none of the rules need to have two RHSs." % (line_count, rule_count)
                        is_synchronous = False
                        rhs1 = rhsstring
                        rhs2 = None                               
                    
                    # First RHS: try to read it as a hypergraph; r1_nts collects the
                    # (label, index) pairs of its nonterminals.
                    # NOTE(review): the graph parse is attempted for every rule, even after
                    # rhs1_type has been set to STRING_FORMAT.
                    try:    # If the first graph in the file cannot be parsed, assume it's a string
                        r1  = Hgraph.from_string(rhs1)
                        r1_nts = set([(ntlabel.label, ntlabel.index) for h, ntlabel, t in r1.nonterminal_edges()])
                        if not rhs1_type:
                            rhs1_type = GRAPH_FORMAT
                    except (ParserError, IndexError), e: 
                        # Once the grammar is known to use graphs, a parse failure is an error;
                        # otherwise fall back to reading the RHS as a string.
                        if rhs1_type == GRAPH_FORMAT:
                           raise ParserError,\
            "Line %i, Rule %i: Could not parse graph description: %s" % (line_count, rule_count, e.message)
                        else:
                           r1 = parse_string(rhs1) 
                           nts = [t for t in r1 if isinstance(t, NonterminalLabel)]
                           r1_nts = set([(ntlabel.label, ntlabel.index) for ntlabel in nts])
                           rhs1_type = STRING_FORMAT
  
                    if is_synchronous:
                        # Second RHS: same graph-then-string strategy. The assert sends the
                        # flow straight to the string fallback when rhs2_type is already
                        # STRING_FORMAT.
                        try:    # If the first graph in the file cannot be parsed, assume it's a string
                            if rhs2_type: 
                                assert rhs2_type == GRAPH_FORMAT
                            r2  = Hgraph.from_string(rhs2)
                            r2_nts = set([(ntlabel.label, ntlabel.index) for h, ntlabel, t in r2.nonterminal_edges()])
                            if not rhs2_type:
                                rhs2_type = GRAPH_FORMAT
                        except (ParserError, IndexError, AssertionError), e: 
                            if rhs2_type == GRAPH_FORMAT:
                               raise ParserError,\
                "Line %i, Rule %i: Could not parse graph description: %s" % (line_count, rule_count, e.message)
                            else:
                               r2 = parse_string(rhs2) 
                               nts = [t for t in r2 if isinstance(t, NonterminalLabel)]
                               r2_nts = set([(ntlabel.label, ntlabel.index) for ntlabel in nts])
                               rhs2_type = STRING_FORMAT

                        # Verify that nonterminals match up
                        if not r1_nts == r2_nts:
                            raise GrammarError, \
            "Line %i, Rule %i: Nonterminals do not match between RHSs: %s %s" % (line_count, rule_count, str(r1_nts), str(r2_nts))
                    else: 
                        r2 = None
                    # Build the rule object; with `reverse` the two RHSs swap places.
                    # Any failure inside rule_class is reported as a GrammarError.
                    try:    
                        if is_synchronous and reverse: 
                            output[rule_count] = rule_class(rule_count, lhs, weight, r2, r1, nodelabels = nodelabels, logprob = logprob)                                     
                        else: 
                            output[rule_count] = rule_class(rule_count, lhs, weight, r1, r2, nodelabels = nodelabels, logprob = logprob) 
                    except Exception, e:         
                        raise GrammarError, \
            "Line %i, Rule %i: Could not initialize rule. %s" % (line_count, rule_count, e.message)
                    # Start collecting the next rule with an empty buffer.
                    buf = StringIO.StringIO() 
                    rule_count += 1
예제 #11
0
    def load_from_file(cls,
                       in_file,
                       rule_class=VoRule,
                       reverse=False,
                       nodelabels=False,
                       logprob=False):
        """
        Loads a SHRG grammar from the given file. 
        See documentation for format details.
        
        rule_class specifies the type of rule to use. VoRule is a subclass using an arbitrary graph
        visit order (also used for strings). TdRule computes a tree decomposition on the first RHS
        when initialized.

        Parameters:
          in_file    -- iterable of text lines. Text after a "#" on a line is discarded,
                        a rule may span several lines and ends on a line containing ";".
          rule_class -- callable used to build each rule object.
          reverse    -- for synchronous rules, pass the two RHSs to rule_class in swapped order.
          nodelabels -- forwarded to Grammar and to rule_class.
          logprob    -- if true, the weight after ";" is used as-is; otherwise its
                        natural logarithm is stored. A missing weight is stored as 0.0.

        Raises GrammarError for malformed rules and ParserError for an RHS that cannot be
        parsed as a graph (and, in one branch, for a missing second RHS).

        NOTE(review): the visible code builds `output` but never returns it; the end of
        this method is probably outside this excerpt.
        """

        # The grammar being filled in; rules are stored under their rule number.
        output = Grammar(nodelabels=nodelabels, logprob=logprob)

        rule_count = 1  # 1-based number of the rule currently being read
        line_count = 0  # input lines consumed so far (used in error messages)
        is_synchronous = False  # True once a rule with two RHSs ("|") has been seen

        # Representation of each RHS (GRAPH_FORMAT or STRING_FORMAT); None until decided.
        rhs1_type = None
        rhs2_type = None

        # Accumulates the text of one rule across input lines.
        buf = StringIO.StringIO()

        for line in in_file:
            line_count += 1
            l = line.strip()
            if l:
                # Keep only the part of the line before the first "#".
                if "#" in l:
                    content, comment = l.split("#", 1)
                else:
                    content = l
                buf.write(content.strip())
                # A ";" on the current line terminates the rule collected in buf.
                if ";" in content:
                    rulestring = buf.getvalue()
                    try:
                        # Everything after the first ";" is the weight. Empty -> 0.0;
                        # otherwise taken as-is (logprob) or log-transformed.
                        content, weights = rulestring.split(";", 1)
                        weight = 0.0 if not weights else (float(
                            weights) if logprob else math.log(float(weights)))
                    except:
                        raise GrammarError, \
            "Line %i, Rule %i: Error near end of line." % (line_count, rule_count)

                    try:
                        # Exactly one "->" is required, otherwise the unpacking fails.
                        lhs, rhsstring = content.split("->")
                    except:
                        raise GrammarError, \
            "Line %i, Rule %i: Invalid rule format." % (line_count, rule_count)
                    lhs = lhs.strip()
                    # The LHS of the first rule becomes the grammar's start symbol.
                    if rule_count == 1:
                        output.start_symbol = lhs
                    # "|" separates two RHSs (synchronous rule). Every rule after the
                    # first must agree with the earlier rules on having one or two RHSs.
                    if "|" in rhsstring:
                        if not is_synchronous and rule_count > 1:
                            raise GrammarError,\
           "Line %i, Rule %i: All or none of the rules need to have two RHSs." % (line_count, rule_count)
                        is_synchronous = True
                        try:
                            # More than one "|" makes this unpacking fail.
                            rhs1, rhs2 = rhsstring.split("|")
                        except:
                            raise GrammarError, "Only up to two RHSs are allowed in grammar file."
                    else:
                        # NOTE(review): rule_count starts at 1, so `rule_count > 0` is always
                        # true; this branch raises ParserError whereas the symmetric check
                        # above raises GrammarError.
                        if is_synchronous and rule_count > 0:
                            raise ParserError,\
            "Line %i, Rule %i: All or none of the rules need to have two RHSs." % (line_count, rule_count)
                        is_synchronous = False
                        rhs1 = rhsstring
                        rhs2 = None

                    # First RHS: try to read it as a hypergraph; r1_nts collects the
                    # (label, index) pairs of its nonterminals.
                    # NOTE(review): the graph parse is attempted for every rule, even after
                    # rhs1_type has been set to STRING_FORMAT.
                    try:  # If the first graph in the file cannot be parsed, assume it's a string
                        r1 = Hgraph.from_string(rhs1)
                        r1_nts = set([
                            (ntlabel.label, ntlabel.index)
                            for h, ntlabel, t in r1.nonterminal_edges()
                        ])
                        if not rhs1_type:
                            rhs1_type = GRAPH_FORMAT
                    except (ParserError, IndexError), e:
                        # Once the grammar is known to use graphs, a parse failure is an error;
                        # otherwise fall back to reading the RHS as a string.
                        if rhs1_type == GRAPH_FORMAT:
                            raise ParserError,\
             "Line %i, Rule %i: Could not parse graph description: %s" % (line_count, rule_count, e.message)
                        else:
                            r1 = parse_string(rhs1)
                            nts = [
                                t for t in r1
                                if isinstance(t, NonterminalLabel)
                            ]
                            r1_nts = set([(ntlabel.label, ntlabel.index)
                                          for ntlabel in nts])
                            rhs1_type = STRING_FORMAT

                    if is_synchronous:
                        # Second RHS: same graph-then-string strategy. The assert sends the
                        # flow straight to the string fallback when rhs2_type is already
                        # STRING_FORMAT.
                        try:  # If the first graph in the file cannot be parsed, assume it's a string
                            if rhs2_type:
                                assert rhs2_type == GRAPH_FORMAT
                            r2 = Hgraph.from_string(rhs2)
                            r2_nts = set([
                                (ntlabel.label, ntlabel.index)
                                for h, ntlabel, t in r2.nonterminal_edges()
                            ])
                            if not rhs2_type:
                                rhs2_type = GRAPH_FORMAT
                        except (ParserError, IndexError, AssertionError), e:
                            if rhs2_type == GRAPH_FORMAT:
                                raise ParserError,\
                 "Line %i, Rule %i: Could not parse graph description: %s" % (line_count, rule_count, e.message)
                            else:
                                r2 = parse_string(rhs2)
                                nts = [
                                    t for t in r2
                                    if isinstance(t, NonterminalLabel)
                                ]
                                r2_nts = set([(ntlabel.label, ntlabel.index)
                                              for ntlabel in nts])
                                rhs2_type = STRING_FORMAT

                        # Verify that nonterminals match up
                        if not r1_nts == r2_nts:
                            raise GrammarError, \
            "Line %i, Rule %i: Nonterminals do not match between RHSs: %s %s" % (line_count, rule_count, str(r1_nts), str(r2_nts))
                    else:
                        r2 = None
                    # Build the rule object; with `reverse` the two RHSs swap places.
                    # Any failure inside rule_class is reported as a GrammarError.
                    try:
                        if is_synchronous and reverse:
                            output[rule_count] = rule_class(
                                rule_count,
                                lhs,
                                weight,
                                r2,
                                r1,
                                nodelabels=nodelabels,
                                logprob=logprob)
                        else:
                            output[rule_count] = rule_class(
                                rule_count,
                                lhs,
                                weight,
                                r1,
                                r2,
                                nodelabels=nodelabels,
                                logprob=logprob)
                    except Exception, e:
                        raise GrammarError, \
            "Line %i, Rule %i: Could not initialize rule. %s" % (line_count, rule_count, e.message)
                    # Start collecting the next rule with an empty buffer.
                    buf = StringIO.StringIO()
                    rule_count += 1