def test_parse_3(self):
    """Parses `FOO FOO NT` with the single right-recursive rule NT -> FOO NT.

    Exactly one parse is expected, and it must equal
    `[(0, 3, 0), (1, 3, 0)]`.
    """
    # Grammar: NT -> FOO NT
    recursive_rhs = (
        cfg_rule.CFGSymbol(FOO, cfg_rule.TERMINAL),
        cfg_rule.CFGSymbol(NT, cfg_rule.NON_TERMINAL),
    )
    recursive_rule = cfg_rule.CFGRule(idx=0, lhs=NT, rhs=recursive_rhs)

    # Input sequence: FOO FOO NT
    symbols = [
        cfg_rule.CFGSymbol(FOO, cfg_rule.TERMINAL),
        cfg_rule.CFGSymbol(FOO, cfg_rule.TERMINAL),
        cfg_rule.CFGSymbol(NT, cfg_rule.NON_TERMINAL),
    ]

    results = cfg_parser.parse_symbols(
        symbols,
        [recursive_rule],
        {NT},
        {NT},
        _populate_fn,
        _postprocess_fn,
        verbose=True,
    )

    self.assertLen(results, 1)
    self.assertEqual(results[0], [(0, 3, 0), (1, 3, 0)])
  def test_parse_6(self):
    """Parses `NT BAR` under two mutually recursive rules.

    The grammar is NT -> NT_2 BAR (idx 0) and NT_2 -> NT BAR (idx 1).
    Exactly one parse is expected, and it must equal `[(0, 2, 1)]`.
    """
    # Rule 0: NT -> NT_2 BAR
    nt_rhs = (
        cfg_rule.CFGSymbol(NT_2, cfg_rule.NON_TERMINAL),
        cfg_rule.CFGSymbol(BAR, cfg_rule.TERMINAL),
    )
    nt_rule = cfg_rule.CFGRule(idx=0, lhs=NT, rhs=nt_rhs)

    # Rule 1: NT_2 -> NT BAR
    nt_2_rhs = (
        cfg_rule.CFGSymbol(NT, cfg_rule.NON_TERMINAL),
        cfg_rule.CFGSymbol(BAR, cfg_rule.TERMINAL),
    )
    nt_2_rule = cfg_rule.CFGRule(idx=1, lhs=NT_2, rhs=nt_2_rhs)

    # Input sequence: NT BAR
    symbols = [
        cfg_rule.CFGSymbol(NT, cfg_rule.NON_TERMINAL),
        cfg_rule.CFGSymbol(BAR, cfg_rule.TERMINAL),
    ]

    results = cfg_parser.parse_symbols(
        symbols,
        [nt_rule, nt_2_rule],
        {NT, NT_2},
        {NT, NT_2},
        _populate_fn,
        _postprocess_fn,
        verbose=True,
    )

    self.assertLen(results, 1)
    self.assertEqual(results[0], [(0, 2, 1)])
    def can_parse(self, tokens, verbose=False):
        """Report whether `tokens` has at least one parse under the target CFG.

        Args:
            tokens: Iterable of token strings. Tokens for which
                `qcfg_rule.is_nt` is true are all mapped to the id of
                `PLACEHOLDER_NT`; every other token is looked up in
                `self.converter.terminals_to_ids`.
            verbose: If True, print debug output, including the filtered rules.

        Returns:
            True if `cfg_parser.parse_symbols` yields a non-empty result.
            False if it yields nothing, or if some terminal token is missing
            from `self.converter.terminals_to_ids`.
        """
        symbols = []
        seen_terminal_ids = set()
        for token in tokens:
            if qcfg_rule.is_nt(token):
                placeholder_id = self.converter.nonterminals_to_ids[
                    PLACEHOLDER_NT]
                symbols.append(
                    cfg_rule.CFGSymbol(placeholder_id, cfg_rule.NON_TERMINAL))
                continue

            if token not in self.converter.terminals_to_ids:
                if verbose:
                    print(
                        "token `%s` not in `converter.terminals_to_ids`: %s"
                        % (token, self.converter.terminals_to_ids))
                return False

            terminal_id = self.converter.terminals_to_ids[token]
            seen_terminal_ids.add(terminal_id)
            symbols.append(cfg_rule.CFGSymbol(terminal_id, cfg_rule.TERMINAL))

        # A rule is only useful if every terminal on its RHS occurs in the
        # input; drop the others before parsing.
        def uses_only_seen_terminals(candidate):
            return all(
                not (symbol.type == cfg_rule.TERMINAL
                     and symbol.idx not in seen_terminal_ids)
                for symbol in candidate.rhs)

        filtered_rules = []
        for candidate in self.parser_rules:
            if uses_only_seen_terminals(candidate):
                filtered_rules.append(candidate)

        if verbose:
            print("filtered_rules:")
            for kept_rule in filtered_rules:
                print(kept_rule)

        # Only the existence of a parse matters, so every chart node is the
        # constant `True`.
        def populate_fn(unused_begin, unused_end, unused_rule,
                        unused_child_nodes):
            return [True]

        nonterminal_ids = set(self.converter.nonterminals_to_ids.values())
        parses = cfg_parser.parse_symbols(input_symbols=symbols,
                                          filtered_rules=filtered_rules,
                                          nonterminals=nonterminal_ids,
                                          start_idx_set=nonterminal_ids,
                                          populate_fn=populate_fn,
                                          postprocess_fn=None,
                                          max_single_nt_applications=2,
                                          verbose=verbose) if False else \
            cfg_parser.parse_symbols(symbols,
                                     filtered_rules,
                                     nonterminal_ids,
                                     nonterminal_ids,
                                     populate_fn,
                                     postprocess_fn=None,
                                     max_single_nt_applications=2,
                                     verbose=verbose)
        return bool(parses)
# Example #4
# 0
def can_parse(target_string,
              rules,
              max_single_nt_applications=2,
              verbose=False):
  """Returns True if there exists >=1 parse of target_string given rules.

  Args:
    target_string: String of tokens separated by single spaces. Tokens that
      start with `NON_TERMINAL_PREFIX` are treated as nonterminals (looked up
      by the text after the prefix); all other tokens are terminals.
    rules: Iterable of rule objects exposing `lhs` and a space-separated
      `rhs` string.
    max_single_nt_applications: Forwarded unchanged to
      `cfg_parser.parse_symbols`.
    verbose: Forwarded unchanged to `cfg_parser.parse_symbols`.

  Returns:
    True if `cfg_parser.parse_symbols` returns a non-empty result. False if it
    returns nothing, or if `ROOT_SYMBOL` or any input token is unknown to the
    converted rules (no parse can exist in those cases).
  """
  tokens = target_string.split(" ")
  # Loop-invariant: build the allowed-terminal set once, not once per rule.
  allowed_terminals = set(tokens)

  # Convert rules.
  converter = cfg_converter.CFGRuleConverter()
  parser_rules = []
  for rule_idx, rule in enumerate(rules):
    parser_rule = converter.convert_to_cfg_rule(
        lhs=rule.lhs,
        rhs=rule.rhs.split(" "),
        rule_idx=rule_idx,
        nonterminal_prefix=NON_TERMINAL_PREFIX,
        allowed_terminals=allowed_terminals)
    if parser_rule:
      parser_rules.append(parser_rule)

  # Without a registered root symbol nothing can be derived, so report "no
  # parse" instead of failing with a KeyError.
  if ROOT_SYMBOL not in converter.nonterminals_to_ids:
    return False
  start_idx = converter.nonterminals_to_ids[ROOT_SYMBOL]
  # Materialize as a set rather than passing a live dict view.
  nonterminals = set(converter.nonterminals_to_ids.values())

  input_symbols = []
  for token in tokens:
    if token.startswith(NON_TERMINAL_PREFIX):
      nonterminal = token[len(NON_TERMINAL_PREFIX):]
      # Treat an unknown nonterminal like an unknown terminal: no parse.
      if nonterminal not in converter.nonterminals_to_ids:
        return False
      idx = converter.nonterminals_to_ids[nonterminal]
      input_symbols.append(cfg_rule.CFGSymbol(idx, cfg_rule.NON_TERMINAL))
    else:
      if token not in converter.terminals_to_ids:
        return False
      idx = converter.terminals_to_ids[token]
      input_symbols.append(cfg_rule.CFGSymbol(idx, cfg_rule.TERMINAL))

  # Run parser.
  parses = cfg_parser.parse_symbols(
      input_symbols,
      parser_rules,
      nonterminals, {start_idx},
      _populate_fn,
      _postprocess_fn,
      verbose=verbose,
      max_single_nt_applications=max_single_nt_applications)

  return bool(parses)
  def test_parse_4(self):
    """Parses `NT FOO BAR` with an ambiguous three-rule grammar.

    The grammar is NT -> BAR (idx 0), NT -> NT FOO NT (idx 1) and
    NT -> NT FOO BAR (idx 2). Two parses are expected, in this order:
    `[(0, 3, 2)]` and `[(0, 3, 1), (2, 3, 0)]`.
    """
    # Rule 0: NT -> BAR
    bar_rhs = (cfg_rule.CFGSymbol(BAR, cfg_rule.TERMINAL),)
    bar_rule = cfg_rule.CFGRule(idx=0, lhs=NT, rhs=bar_rhs)

    # Rule 1: NT -> NT FOO NT
    nt_foo_nt_rhs = (
        cfg_rule.CFGSymbol(NT, cfg_rule.NON_TERMINAL),
        cfg_rule.CFGSymbol(FOO, cfg_rule.TERMINAL),
        cfg_rule.CFGSymbol(NT, cfg_rule.NON_TERMINAL),
    )
    nt_foo_nt_rule = cfg_rule.CFGRule(idx=1, lhs=NT, rhs=nt_foo_nt_rhs)

    # Rule 2: NT -> NT FOO BAR
    nt_foo_bar_rhs = (
        cfg_rule.CFGSymbol(NT, cfg_rule.NON_TERMINAL),
        cfg_rule.CFGSymbol(FOO, cfg_rule.TERMINAL),
        cfg_rule.CFGSymbol(BAR, cfg_rule.TERMINAL),
    )
    nt_foo_bar_rule = cfg_rule.CFGRule(idx=2, lhs=NT, rhs=nt_foo_bar_rhs)

    # Input sequence: NT FOO BAR
    symbols = [
        cfg_rule.CFGSymbol(NT, cfg_rule.NON_TERMINAL),
        cfg_rule.CFGSymbol(FOO, cfg_rule.TERMINAL),
        cfg_rule.CFGSymbol(BAR, cfg_rule.TERMINAL),
    ]

    results = cfg_parser.parse_symbols(
        symbols,
        [bar_rule, nt_foo_nt_rule, nt_foo_bar_rule],
        {NT},
        {NT},
        _populate_fn,
        _postprocess_fn,
        verbose=True,
    )

    self.assertLen(results, 2)
    self.assertEqual(results, [[(0, 3, 2)], [(0, 3, 1), (2, 3, 0)]])
def parse(tokens,
          rules,
          node_fn,
          postprocess_cell_fn,
          max_single_nt_applications=1,
          verbose=False):
    """Run the bottom-up parser over `tokens` using QCFG rule sources.

    Args:
        tokens: List of strings for input (terminals or nonterminals).
        rules: List of QCFGRule instances.
        node_fn: Function with input arguments (span_begin, span_end, rule,
            children) that returns a "node".
        postprocess_cell_fn: Function from a list of "nodes" to "nodes".
        max_single_nt_applications: The maximum number of times a rule where
            the RHS is a single nonterminal symbol can be applied
            consecutively.
        verbose: Print debug output if True.

    Returns:
        A List of "node" objects for completed parses. The list is empty when
        a terminal token of the input does not appear in any retained rule.
    """
    if verbose:
        print("tokens: %s" % (tokens, ))
        print("rules:")
        for original in rules:
            print(str(original))

    # Our QCFG grammars always use a single NT symbol.
    single_nt_id = 0

    # Convert to ParserRule format, keeping a map back to the source rules.
    converter = cfg_converter.CFGRuleConverter()
    rule_by_idx = {}
    parser_rules = []

    allowed_terminals = set(tokens)
    for original in rules:
        if qcfg_rule.is_allowed(original.source, allowed_terminals):
            # Indices are dense over the retained rules only.
            next_idx = len(parser_rules)
            converted = converter.convert_to_cfg_rule(
                lhs=NT_IDX,
                rhs=_convert_nt(original.source),
                rule_idx=next_idx,
                nonterminal_prefix=NON_TERMINAL_PREFIX)
            parser_rules.append(converted)
            rule_by_idx[next_idx] = original

    # Bail out early if some terminal of the input is unknown to the rules.
    for token in tokens:
        if qcfg_rule.is_nt(token) or token in converter.terminals_to_ids:
            continue
        if verbose:
            print("Input token does not appear in rules: %s" % token)
        return []

    input_symbols = [
        cfg_rule.CFGSymbol(single_nt_id, cfg_rule.NON_TERMINAL)
        if qcfg_rule.is_nt(token) else cfg_rule.CFGSymbol(
            converter.terminals_to_ids[token], cfg_rule.TERMINAL)
        for token in tokens
    ]

    # Wrap node_fn to pass original Rule instead of CFGRule.
    def populate_fn(span_begin, span_end, parser_rule, children):
        return [
            node_fn(span_begin, span_end, rule_by_idx[parser_rule.idx],
                    children)
        ]

    if verbose:
        print("parser_rules: %s" % parser_rules)

    return cfg_parser.parse_symbols(
        input_symbols,
        parser_rules,
        {single_nt_id},
        {single_nt_id},
        populate_fn,
        postprocess_cell_fn,
        max_single_nt_applications=max_single_nt_applications,
        verbose=verbose)
def parse(tokens, rules, node_fn, postprocess_fn, verbose=False):
    """Run bottom up parser on QCFG target using target CFG.

    Args:
        tokens: List of strings for input.
        rules: List of TargetCfgRule instances.
        node_fn: Function with input arguments (span_begin, span_end, rule,
            children) and returns a list of "node".
        postprocess_fn: Function from a list of "nodes" to "nodes".
        verbose: Print debug output if True.

    Returns:
        A List of "node" objects for completed parses. The list is empty when
        an input token is not known to the converter after rule conversion.
    """
    if verbose:
        print("tokens: %s" % (tokens, ))
        print("rules:")
        for rule in rules:
            print(str(rule))

    # Terminals of the input, built once: the set is the same for every rule,
    # so there is no reason to rebuild it inside the conversion loop.
    allowed_terminals = {
        token for token in tokens
        if not token.startswith(qcfg_rule.NON_TERMINAL_PREFIX)
    }

    # Convert rules. Indices are assigned only to rules the converter keeps.
    converter = cfg_converter.CFGRuleConverter()
    parser_rules = []
    idx_to_rule = {}
    rule_idx = 0
    for rule in rules:
        parser_rule = converter.convert_to_cfg_rule(
            lhs=rule.lhs,
            rhs=rule.rhs.split(" "),
            rule_idx=rule_idx,
            nonterminal_prefix=target_grammar.NON_TERMINAL_PREFIX,
            allowed_terminals=allowed_terminals)
        if parser_rule:
            parser_rules.append(parser_rule)
            idx_to_rule[rule_idx] = rule
            rule_idx += 1

    # Add rules for every target nonterminal and QCFG nonterminal.
    # The target nonterminals are snapshotted first, so names registered
    # while adding these rules do not extend the outer loop.
    target_nts = set(converter.nonterminals_to_ids.keys())
    qcfg_nts = set(qcfg_rule.get_nts(tokens))
    for target_nt in target_nts:
        for qcfg_nt in qcfg_nts:
            rule = target_grammar.TargetCfgRule(target_nt,
                                                _convert_qcfg_nt(qcfg_nt))
            parser_rule = converter.convert_to_cfg_rule(
                lhs=rule.lhs,
                rhs=rule.rhs.split(" "),
                rule_idx=rule_idx,
                nonterminal_prefix=target_grammar.NON_TERMINAL_PREFIX)
            parser_rules.append(parser_rule)
            idx_to_rule[rule_idx] = rule
            rule_idx += 1

    # Map input tokens to parser symbols; any unknown token means no parse.
    input_symbols = []
    for token in tokens:
        if qcfg_rule.is_nt(token):
            if token not in converter.nonterminals_to_ids:
                return []
            idx = converter.nonterminals_to_ids[token]
            input_symbols.append(cfg_rule.CFGSymbol(idx,
                                                    cfg_rule.NON_TERMINAL))
        else:
            if token not in converter.terminals_to_ids:
                return []
            idx = converter.terminals_to_ids[token]
            input_symbols.append(cfg_rule.CFGSymbol(idx, cfg_rule.TERMINAL))

    # Wrap node_fn to pass original Rule instead of CFGRule.
    def populate_fn(span_begin, span_end, parser_rule, children):
        rule = idx_to_rule[parser_rule.idx]
        nodes = node_fn(span_begin, span_end, rule, children)
        return nodes

    # The same id set is passed as both the nonterminal argument and the
    # following positional argument of parse_symbols.
    nonterminals = set(converter.nonterminals_to_ids.values())
    if verbose:
        print("parser_rules: %s" % parser_rules)

    parses = cfg_parser.parse_symbols(input_symbols,
                                      parser_rules,
                                      nonterminals,
                                      nonterminals,
                                      populate_fn,
                                      postprocess_fn,
                                      max_single_nt_applications=0,
                                      verbose=verbose)
    return parses
    def convert(self, induced_rule, verbose=False):
        """Convert QCFGRule to JointRule.

        The target side of `induced_rule` is parsed with the rules in
        `self.parser_rules`; the nonterminal assignments extracted from each
        parse are collected into the resulting JointRule.

        Args:
            induced_rule: Rule whose `target` is an iterable of token strings.
            verbose: If True, print the filtered rules and pass verbose
                through to the parser.

        Returns:
            `JointRule(induced_rule, frozenset(...))` holding the distinct
            non-empty nonterminal assignments found across all parses, or
            None (after printing a message) when the target has no parse.

        Raises:
            ValueError: If a terminal token of the target is missing from
                `self.converter.terminals_to_ids`.
        """
        tokens = induced_rule.target
        input_symbols = []
        terminal_ids = set()
        qcfg_idxs = []
        num_nts = 0
        for token in tokens:
            if qcfg_rule.is_nt(token):
                qcfg_idx = qcfg_rule.get_nt_index(token)
                qcfg_idxs.append(qcfg_idx)
                # NT placeholders are 1-indexed and numbered by order of
                # appearance, independent of the QCFG index of the token.
                qcfg_nt = NT_PLACEHOLDER % (num_nts + 1)
                num_nts += 1
                idx = self.converter.nonterminals_to_ids[qcfg_nt]
                input_symbols.append(
                    cfg_rule.CFGSymbol(idx, cfg_rule.NON_TERMINAL))
            else:
                if token not in self.converter.terminals_to_ids:
                    raise ValueError(
                        "token `%s` not in `converter.terminals_to_ids`: %s" %
                        (token, self.converter.terminals_to_ids))
                idx = self.converter.terminals_to_ids[token]
                terminal_ids.add(idx)
                input_symbols.append(cfg_rule.CFGSymbol(
                    idx, cfg_rule.TERMINAL))

        # Filter rules that contain terminals not in the input.
        def should_include(parser_rule):
            for symbol in parser_rule.rhs:
                if symbol.type == cfg_rule.TERMINAL and symbol.idx not in terminal_ids:
                    return False
            return True

        filtered_rules = [
            rule for rule in self.parser_rules if should_include(rule)
        ]
        if verbose:
            print("filtered_rules:")
            for rule in filtered_rules:
                print(rule)

        # Each chart node records the applied rule and its children so the
        # parse tree can be inspected afterwards.
        def populate_fn(unused_span_begin, unused_span_end, parser_rule,
                        children):
            return [ParseNode(parser_rule, children)]

        nonterminals = set(self.converter.nonterminals_to_ids.values())
        parses = cfg_parser.parse_symbols(
            input_symbols,
            filtered_rules,
            nonterminals,
            nonterminals,
            populate_fn,
            postprocess_fn=None,
            max_single_nt_applications=self.max_single_nt_applications,
            verbose=verbose)
        if not parses:
            print("Could not parse: %s" % (tokens, ))
            return None

        # Extract cfg_nts from parses.
        # NOTE(review): `_rearrange_nts` presumably reorders the assignment
        # to follow `qcfg_idxs`; confirm against its definition.
        cfg_nts_set = set()
        for parse_node in parses:
            cfg_nts = _get_cfg_nts(self.converter.nonterminals_to_ids,
                                   self.rhs_nt_rules, parse_node, num_nts)
            cfg_nts = _rearrange_nts(cfg_nts, qcfg_idxs)
            # Falsy results are skipped rather than recorded.
            if cfg_nts:
                cfg_nts_set.add(cfg_nts)

        return JointRule(induced_rule, frozenset(cfg_nts_set))