Ejemplo n.º 1
0
    def test_parse_1(self, use_trie):
        # Right-recursive grammar over two terminals:
        #   rule 0: NT -> BAR
        #   rule 1: NT -> FOO NT
        def terminal(symbol_id):
            return cfg_rule.CFGSymbol(symbol_id, cfg_rule.TERMINAL)

        def non_terminal(symbol_id):
            return cfg_rule.CFGSymbol(symbol_id, cfg_rule.NON_TERMINAL)

        grammar = [
            cfg_rule.CFGRule(idx=0, lhs=NT, rhs=(terminal(BAR), )),
            cfg_rule.CFGRule(idx=1,
                             lhs=NT,
                             rhs=(terminal(FOO), non_terminal(NT))),
        ]

        parses = cfg_parser.parse(
            [FOO, FOO, BAR],
            grammar,
            {NT},
            NT,
            _populate_fn,
            _postprocess_fn,
            use_trie=use_trie,
            verbose=True,
        )

        # The input has exactly one derivation under this grammar. The
        # entries are presumably (span_begin, span_end, rule idx) triples as
        # produced by the test's _populate_fn/_postprocess_fn -- confirm
        # against those helpers.
        expected_parse = [(0, 3, 1), (1, 3, 1), (2, 3, 0)]
        self.assertLen(parses, 1)
        self.assertEqual(parses[0], expected_parse)
Ejemplo n.º 2
0
    def test_parse_2(self, use_trie):
        # Ambiguous grammar:
        #   rule 0: NT -> BAR
        #   rule 1: NT -> NT FOO NT
        #   rule 2: NT -> NT FOO BAR
        def terminal(symbol_id):
            return cfg_rule.CFGSymbol(symbol_id, cfg_rule.TERMINAL)

        def non_terminal(symbol_id):
            return cfg_rule.CFGSymbol(symbol_id, cfg_rule.NON_TERMINAL)

        grammar = [
            cfg_rule.CFGRule(idx=0, lhs=NT, rhs=(terminal(BAR), )),
            cfg_rule.CFGRule(idx=1,
                             lhs=NT,
                             rhs=(non_terminal(NT), terminal(FOO),
                                  non_terminal(NT))),
            cfg_rule.CFGRule(idx=2,
                             lhs=NT,
                             rhs=(non_terminal(NT), terminal(FOO),
                                  terminal(BAR))),
        ]

        parses = cfg_parser.parse(
            [BAR, FOO, BAR],
            grammar,
            {NT},
            NT,
            _populate_fn,
            _postprocess_fn,
            use_trie=use_trie,
            verbose=True,
        )

        # "BAR FOO BAR" can be derived either through rule 2 or through
        # rule 1, so two parses are expected, in this order.
        expected_parses = [
            [(0, 3, 2), (0, 1, 0)],
            [(0, 3, 1), (0, 1, 0), (2, 3, 0)],
        ]
        self.assertLen(parses, 2)
        self.assertEqual(parses, expected_parses)
Ejemplo n.º 3
0
def _run_parser(tokens, rules, verbose):
    """Run the bottom-up CFG parser over `tokens` using `rules`.

    Unit rules are expanded first, then tokens and rule left-hand sides are
    mapped to integer IDs, every rule is converted to a `cfg_rule.CFGRule`,
    and the result is handed to `cfg_parser.parse`.

    Args:
        tokens: Sequence of hashable input tokens.
        rules: Grammar rules; each must expose an `lhs` attribute. At least
            one rule must have "ROOT" as its left-hand side.
        verbose: Forwarded to `cfg_parser.parse`.

    Returns:
        Whatever `cfg_parser.parse` returns for the converted grammar.

    Raises:
        KeyError: If no rule (after unit-rule expansion) has "ROOT" as its
            left-hand side.
    """
    # Expand and eliminate unit rules.
    rules = expand_unit_rules(rules)

    # Convert tokens to integer IDs. IDs follow first-occurrence order so
    # that they are reproducible across runs; iterating a set of strings
    # would depend on per-process hash randomisation.
    terminals_to_ids = {
        token: idx
        for idx, token in enumerate(dict.fromkeys(tokens))
    }
    input_ids = [terminals_to_ids[token] for token in tokens]

    # Generate non-terminal IDs in order of first appearance as an lhs.
    nonterminals_to_ids = {}
    for rule in rules:
        nonterminals_to_ids.setdefault(rule.lhs, len(nonterminals_to_ids))
    nonterminals = nonterminals_to_ids.values()
    start_idx = nonterminals_to_ids["ROOT"]

    # Convert to ParserRule format. Rules whose rhs cannot be converted
    # (the helper returns None) are dropped, so parser rule indices are
    # dense and may differ from positions in `rules`.
    idx_to_rule = {}
    parser_rules = []
    for rule in rules:
        rhs = _convert_to_rhs(rule, terminals_to_ids, nonterminals_to_ids)
        if rhs is None:
            continue
        rule_idx = len(parser_rules)
        parser_rules.append(
            cfg_rule.CFGRule(idx=rule_idx,
                             lhs=nonterminals_to_ids[rule.lhs],
                             rhs=rhs))
        idx_to_rule[rule_idx] = rule

    populate_fn = _get_populate_fn(idx_to_rule)
    return cfg_parser.parse(input_ids,
                            parser_rules,
                            nonterminals,
                            start_idx,
                            populate_fn=populate_fn,
                            postprocess_fn=_postprocess_fn,
                            verbose=verbose)
Ejemplo n.º 4
0
def _convert_to_parser_rule(rule, terminals_to_ids, nonterminals_to_ids,
                            rule_idx):
    """Build a `cfg_rule.CFGRule` from a rule with a space-separated rhs.

    Tokens of `rule.rhs` that start with `NON_TERMINAL_PREFIX` are looked up
    (prefix stripped) in `nonterminals_to_ids`; all other tokens are looked
    up in `terminals_to_ids`.

    Returns:
        The converted CFGRule, or None if the rhs contains a terminal that is
        not present in `terminals_to_ids`.
    """
    pieces = rule.rhs.split(" ")
    symbols = []
    for piece in pieces:
        is_nonterminal = piece.startswith(NON_TERMINAL_PREFIX)
        if not is_nonterminal and piece not in terminals_to_ids:
            # Unknown terminal: signal the caller that no rule was built.
            return None
        if is_nonterminal:
            name = piece[len(NON_TERMINAL_PREFIX):]
            symbol = cfg_rule.CFGSymbol(idx=nonterminals_to_ids[name],
                                        type=cfg_rule.NON_TERMINAL)
        else:
            symbol = cfg_rule.CFGSymbol(idx=terminals_to_ids[piece],
                                        type=cfg_rule.TERMINAL)
        symbols.append(symbol)
    return cfg_rule.CFGRule(idx=rule_idx,
                            lhs=nonterminals_to_ids[rule.lhs],
                            rhs=symbols)
Ejemplo n.º 5
0
def parse(tokens, rules, node_fn, postprocess_cell_fn, verbose=False):
    """Run bottom up parser.

    Args:
        tokens: List of strings for input.
        rules: List of QCFGRule instances.
        node_fn: Function with input arguments (span_begin, span_end, rule,
            children) and returns a "node".
        postprocess_cell_fn: Function from a list of "nodes" to "nodes".
        verbose: Print debug output if True.

    Returns:
        A List of "node" objects for completed parses.
    """
    if verbose:
        print("tokens: %s" % (tokens, ))
        print("rules:")
        for rule in rules:
            print(str(rule))

    # Convert tokens to integer IDs. IDs follow first-occurrence order so
    # that they (and the verbose dump of parser_rules below) are reproducible
    # across runs; iterating a set of strings would depend on per-process
    # hash randomisation.
    tokens_to_input_ids = {
        token: idx
        for idx, token in enumerate(dict.fromkeys(tokens))
    }
    input_ids = [tokens_to_input_ids[token] for token in tokens]

    # Our QCFG grammars always use a single NT symbol.
    nt_idx = 0

    # Convert to ParserRule format. Rules whose rhs cannot be converted (the
    # helper returns None) are skipped, so parser rule indices are dense and
    # idx_to_rule maps them back to the original rule objects.
    idx_to_rule = {}
    parser_rules = []
    for rule in rules:
        rhs = _convert_rhs(rule, nt_idx, tokens_to_input_ids)
        if rhs is None:
            continue
        rule_idx = len(parser_rules)
        parser_rules.append(
            cfg_rule.CFGRule(idx=rule_idx, lhs=nt_idx, rhs=rhs))
        idx_to_rule[rule_idx] = rule

    # Wrap node_fn to pass original Rule instead of CFGRule.
    def populate_fn(span_begin, span_end, parser_rule, children):
        rule = idx_to_rule[parser_rule.idx]
        return node_fn(span_begin, span_end, rule, children)

    nonterminals = {nt_idx}
    start_idx = nt_idx

    if verbose:
        print("parser_rules: %s" % parser_rules)

    return cfg_parser.parse(input_ids,
                            parser_rules,
                            nonterminals,
                            start_idx,
                            populate_fn,
                            postprocess_cell_fn,
                            verbose=verbose)