def test_parse_3(self):
    """Checks that `NT -> FOO NT` yields exactly one parse of `FOO FOO NT`."""

    def terminal(symbol_id):
        return cfg_rule.CFGSymbol(symbol_id, cfg_rule.TERMINAL)

    def nonterminal(symbol_id):
        return cfg_rule.CFGSymbol(symbol_id, cfg_rule.NON_TERMINAL)

    # NT -> FOO NT
    grammar = [
        cfg_rule.CFGRule(idx=0, lhs=NT, rhs=(terminal(FOO), nonterminal(NT))),
    ]
    sentence = [terminal(FOO), terminal(FOO), nonterminal(NT)]

    parses = cfg_parser.parse_symbols(
        sentence,
        grammar,
        {NT},
        {NT},
        _populate_fn,
        _postprocess_fn,
        verbose=True)

    self.assertLen(parses, 1)
    # The rule is applied twice: once over the whole input and once over the
    # suffix starting at position 1.
    # NOTE(review): tuples look like (span_begin, span_end, rule_idx) - confirm
    # against `_populate_fn`.
    self.assertEqual(parses[0], [(0, 3, 0), (1, 3, 0)])
def test_parse_6(self):
    """Checks a two-rule grammar where NT and NT_2 refer to each other."""

    def terminal(symbol_id):
        return cfg_rule.CFGSymbol(symbol_id, cfg_rule.TERMINAL)

    def nonterminal(symbol_id):
        return cfg_rule.CFGSymbol(symbol_id, cfg_rule.NON_TERMINAL)

    grammar = [
        # NT -> NT_2 BAR
        cfg_rule.CFGRule(
            idx=0, lhs=NT, rhs=(nonterminal(NT_2), terminal(BAR))),
        # NT_2 -> NT BAR
        cfg_rule.CFGRule(
            idx=1, lhs=NT_2, rhs=(nonterminal(NT), terminal(BAR))),
    ]
    sentence = [nonterminal(NT), terminal(BAR)]

    parses = cfg_parser.parse_symbols(
        sentence,
        grammar,
        {NT, NT_2},
        {NT, NT_2},
        _populate_fn,
        _postprocess_fn,
        verbose=True)

    # Only the second rule (idx=1) matches the two-symbol input directly.
    self.assertLen(parses, 1)
    self.assertEqual(parses[0], [(0, 2, 1)])
def can_parse(self, tokens, verbose=False):
    """Return True if can be parsed given target CFG.

    Args:
      tokens: Iterable of string tokens. Tokens recognised by
        `qcfg_rule.is_nt` are all mapped to the `PLACEHOLDER_NT` nonterminal.
      verbose: Print debug output if True.

    Returns:
      True if `cfg_parser.parse_symbols` returns at least one parse. False if
      it returns none, or if some terminal token is missing from
      `self.converter.terminals_to_ids`.
    """
    input_symbols = []
    seen_terminal_ids = set()
    for token in tokens:
        if qcfg_rule.is_nt(token):
            symbol = cfg_rule.CFGSymbol(
                self.converter.nonterminals_to_ids[PLACEHOLDER_NT],
                cfg_rule.NON_TERMINAL)
        elif token in self.converter.terminals_to_ids:
            terminal_id = self.converter.terminals_to_ids[token]
            seen_terminal_ids.add(terminal_id)
            symbol = cfg_rule.CFGSymbol(terminal_id, cfg_rule.TERMINAL)
        else:
            # A terminal unknown to the grammar can never be derived.
            if verbose:
                print(
                    "token `%s` not in `converter.terminals_to_ids`: %s" %
                    (token, self.converter.terminals_to_ids))
            return False
        input_symbols.append(symbol)

    # Only keep rules whose terminals all occur in the input; any other rule
    # cannot take part in a parse of these tokens.
    def uses_only_seen_terminals(parser_rule):
        return all(
            symbol.type != cfg_rule.TERMINAL
            or symbol.idx in seen_terminal_ids
            for symbol in parser_rule.rhs)

    filtered_rules = [
        candidate for candidate in self.parser_rules
        if uses_only_seen_terminals(candidate)
    ]
    if verbose:
        print("filtered_rules:")
        for kept_rule in filtered_rules:
            print(kept_rule)

    # Only the existence of a parse matters, so every chart node is a
    # constant marker.
    def populate_fn(unused_span_begin, unused_span_end, unused_parser_rule,
                    unused_children):
        return [True]

    nonterminals = set(self.converter.nonterminals_to_ids.values())
    parses = cfg_parser.parse_symbols(
        input_symbols,
        filtered_rules,
        nonterminals,
        nonterminals,
        populate_fn,
        postprocess_fn=None,
        max_single_nt_applications=2,
        verbose=verbose)
    return bool(parses)
def can_parse(target_string,
              rules,
              max_single_nt_applications=2,
              verbose=False):
    """Returns True if there exists >=1 parse of target_string given rules.

    Args:
      target_string: Space-separated string of tokens. Tokens starting with
        `NON_TERMINAL_PREFIX` are treated as nonterminals, the rest as
        terminals.
      rules: Iterable of rules exposing `lhs` and a space-separated `rhs`.
      max_single_nt_applications: Forwarded unchanged to
        `cfg_parser.parse_symbols`.
      verbose: Print debug output if True.

    Returns:
      True if the parser finds at least one parse rooted at `ROOT_SYMBOL`.
      False if it finds none, or if any input token (terminal or
      nonterminal) is unknown to the converted grammar.
    """
    tokens = target_string.split(" ")
    # Built once: the allowed-terminal set is the same for every rule.
    allowed_terminals = set(tokens)

    # Convert rules.
    converter = cfg_converter.CFGRuleConverter()
    parser_rules = []
    for rule_idx, rule in enumerate(rules):
        parser_rule = converter.convert_to_cfg_rule(
            lhs=rule.lhs,
            rhs=rule.rhs.split(" "),
            rule_idx=rule_idx,
            nonterminal_prefix=NON_TERMINAL_PREFIX,
            allowed_terminals=allowed_terminals)
        # The converter returns a falsy value for rules it rejects.
        if parser_rule:
            parser_rules.append(parser_rule)

    # NOTE(review): this lookup presumably raises KeyError when no converted
    # rule mentions ROOT_SYMBOL - confirm against CFGRuleConverter.
    start_idx = converter.nonterminals_to_ids[ROOT_SYMBOL]
    # Materialise as a set (as the other parser entry points do) instead of
    # handing the parser a live dict view.
    nonterminals = set(converter.nonterminals_to_ids.values())

    input_symbols = []
    for token in tokens:
        if token.startswith(NON_TERMINAL_PREFIX):
            nt_name = token[len(NON_TERMINAL_PREFIX):]
            # A nonterminal absent from the grammar cannot be derived; treat
            # it exactly like an unknown terminal.
            if nt_name not in converter.nonterminals_to_ids:
                return False
            idx = converter.nonterminals_to_ids[nt_name]
            input_symbols.append(
                cfg_rule.CFGSymbol(idx, cfg_rule.NON_TERMINAL))
        else:
            if token not in converter.terminals_to_ids:
                return False
            idx = converter.terminals_to_ids[token]
            input_symbols.append(cfg_rule.CFGSymbol(idx, cfg_rule.TERMINAL))

    # Run parser.
    parses = cfg_parser.parse_symbols(
        input_symbols,
        parser_rules,
        nonterminals,
        {start_idx},
        _populate_fn,
        _postprocess_fn,
        verbose=verbose,
        max_single_nt_applications=max_single_nt_applications)
    return bool(parses)
def test_parse_4(self):
    """Checks that an ambiguous grammar yields both parses of `NT FOO BAR`."""

    def terminal(symbol_id):
        return cfg_rule.CFGSymbol(symbol_id, cfg_rule.TERMINAL)

    def nonterminal(symbol_id):
        return cfg_rule.CFGSymbol(symbol_id, cfg_rule.NON_TERMINAL)

    grammar = [
        # NT -> BAR
        cfg_rule.CFGRule(idx=0, lhs=NT, rhs=(terminal(BAR),)),
        # NT -> NT FOO NT
        cfg_rule.CFGRule(
            idx=1,
            lhs=NT,
            rhs=(nonterminal(NT), terminal(FOO), nonterminal(NT))),
        # NT -> NT FOO BAR
        cfg_rule.CFGRule(
            idx=2,
            lhs=NT,
            rhs=(nonterminal(NT), terminal(FOO), terminal(BAR))),
    ]
    sentence = [nonterminal(NT), terminal(FOO), terminal(BAR)]

    parses = cfg_parser.parse_symbols(
        sentence,
        grammar,
        {NT},
        {NT},
        _populate_fn,
        _postprocess_fn,
        verbose=True)

    # One parse uses rule 2 directly; the other uses rule 1 with the trailing
    # BAR first reduced to NT by rule 0.
    expected = [[(0, 3, 2)], [(0, 3, 1), (2, 3, 0)]]
    self.assertLen(parses, 2)
    self.assertEqual(parses, expected)
def parse(tokens,
          rules,
          node_fn,
          postprocess_cell_fn,
          max_single_nt_applications=1,
          verbose=False):
    """Run bottom up parser.

    Args:
      tokens: List of strings for input (terminals or nonterminals).
      rules: List of QCFGRule instances.
      node_fn: Function with input arguments (span_begin, span_end, rule,
        children) and returns a "node".
      postprocess_cell_fn: Function from a list of "nodes" to "nodes".
      max_single_nt_applications: The maximum number of times a rule where the
        RHS is a single nonterminal symbol can be applied consecutively.
      verbose: Print debug output if True.

    Returns:
      A List of "node" objects for completed parses. The list is empty when
      some terminal in `tokens` does not occur in any retained rule.
    """
    if verbose:
        print("tokens: %s" % (tokens, ))
        print("rules:")
        for rule in rules:
            print(str(rule))

    # Our QCFG grammars always use a single NT symbol.
    nt_idx = 0

    # Convert to ParserRule format, skipping rules whose source side uses
    # terminals that do not occur in the input.
    converter = cfg_converter.CFGRuleConverter()
    idx_to_rule = {}
    parser_rules = []
    allowed_terminals = set(tokens)
    for rule in rules:
        if qcfg_rule.is_allowed(rule.source, allowed_terminals):
            # Retained rules are numbered densely in order of acceptance.
            rule_idx = len(parser_rules)
            parser_rules.append(
                converter.convert_to_cfg_rule(
                    lhs=NT_IDX,
                    rhs=_convert_nt(rule.source),
                    rule_idx=rule_idx,
                    nonterminal_prefix=NON_TERMINAL_PREFIX))
            idx_to_rule[rule_idx] = rule

    # Map every input token to a parser symbol; give up as soon as a terminal
    # is found that no retained rule can produce.
    input_symbols = []
    for token in tokens:
        if qcfg_rule.is_nt(token):
            symbol = cfg_rule.CFGSymbol(nt_idx, cfg_rule.NON_TERMINAL)
        elif token in converter.terminals_to_ids:
            symbol = cfg_rule.CFGSymbol(
                converter.terminals_to_ids[token], cfg_rule.TERMINAL)
        else:
            if verbose:
                print("Input token does not appear in rules: %s" % token)
            return []
        input_symbols.append(symbol)

    # Wrap node_fn to pass original Rule instead of CFGRule.
    def populate_fn(span_begin, span_end, parser_rule, children):
        original_rule = idx_to_rule[parser_rule.idx]
        return [node_fn(span_begin, span_end, original_rule, children)]

    if verbose:
        print("parser_rules: %s" % parser_rules)

    # The single nonterminal is both the only nonterminal and the start symbol.
    return cfg_parser.parse_symbols(
        input_symbols,
        parser_rules,
        {nt_idx},
        {nt_idx},
        populate_fn,
        postprocess_cell_fn,
        max_single_nt_applications=max_single_nt_applications,
        verbose=verbose)
def parse(tokens, rules, node_fn, postprocess_fn, verbose=False):
    """Run bottom up parser on QCFG target using target CFG.

    Args:
      tokens: List of strings for input.
      rules: List of TargetCfgRule instances.
      node_fn: Function with input arguments (span_begin, span_end, rule,
        children) and returns a list of "node".
      postprocess_fn: Function from a list of "nodes" to "nodes".
      verbose: Print debug output if True.

    Returns:
      A List of "node" objects for completed parses. The list is empty when
      an input token is unknown to the converted grammar.
    """
    if verbose:
        print("tokens: %s" % (tokens, ))
        print("rules:")
        for rule in rules:
            print(str(rule))

    # Built once up front: the set of input terminals is identical for every
    # rule, so there is no need to rebuild it per conversion.
    allowed_terminals = {
        token for token in tokens
        if not token.startswith(qcfg_rule.NON_TERMINAL_PREFIX)
    }

    # Convert rules.
    converter = cfg_converter.CFGRuleConverter()
    parser_rules = []
    idx_to_rule = {}
    rule_idx = 0
    for rule in rules:
        parser_rule = converter.convert_to_cfg_rule(
            lhs=rule.lhs,
            rhs=rule.rhs.split(" "),
            rule_idx=rule_idx,
            nonterminal_prefix=target_grammar.NON_TERMINAL_PREFIX,
            allowed_terminals=allowed_terminals)
        # The converter returns a falsy value for rules it rejects; those
        # rules do not consume an index.
        if parser_rule:
            parser_rules.append(parser_rule)
            idx_to_rule[rule_idx] = rule
            rule_idx += 1

    # Add rules for every target nonterminal and QCFG nonterminal.
    # `target_nts` is a snapshot taken before the loop: the conversions below
    # may register further nonterminals in the converter.
    target_nts = set(converter.nonterminals_to_ids.keys())
    qcfg_nts = set(qcfg_rule.get_nts(tokens))
    for target_nt in target_nts:
        for qcfg_nt in qcfg_nts:
            rule = target_grammar.TargetCfgRule(target_nt,
                                                _convert_qcfg_nt(qcfg_nt))
            parser_rule = converter.convert_to_cfg_rule(
                lhs=rule.lhs,
                rhs=rule.rhs.split(" "),
                rule_idx=rule_idx,
                nonterminal_prefix=target_grammar.NON_TERMINAL_PREFIX)
            parser_rules.append(parser_rule)
            idx_to_rule[rule_idx] = rule
            rule_idx += 1

    input_symbols = []
    for token in tokens:
        if qcfg_rule.is_nt(token):
            if token not in converter.nonterminals_to_ids:
                return []
            idx = converter.nonterminals_to_ids[token]
            input_symbols.append(
                cfg_rule.CFGSymbol(idx, cfg_rule.NON_TERMINAL))
        else:
            if token not in converter.terminals_to_ids:
                return []
            idx = converter.terminals_to_ids[token]
            input_symbols.append(cfg_rule.CFGSymbol(idx, cfg_rule.TERMINAL))

    # Wrap node_fn to pass original Rule instead of CFGRule.
    def populate_fn(span_begin, span_end, parser_rule, children):
        rule = idx_to_rule[parser_rule.idx]
        return node_fn(span_begin, span_end, rule, children)

    # Every nonterminal is accepted as a start symbol.
    nonterminals = set(converter.nonterminals_to_ids.values())

    if verbose:
        print("parser_rules: %s" % parser_rules)

    return cfg_parser.parse_symbols(
        input_symbols,
        parser_rules,
        nonterminals,
        nonterminals,
        populate_fn,
        postprocess_fn,
        max_single_nt_applications=0,
        verbose=verbose)
def convert(self, induced_rule, verbose=False):
    """Convert QCFGRule to JointRule.

    Parses `induced_rule.target` with the stored parser rules and collects,
    for each parse, the nonterminal assignment returned by `_get_cfg_nts`
    after reordering by `_rearrange_nts`.

    Args:
      induced_rule: Rule whose `target` attribute is the token sequence to
        parse.
      verbose: Print debug output if True.

    Returns:
      A JointRule built from `induced_rule` and a frozenset of the collected
      assignments, or None (after printing a message) if the target cannot be
      parsed.

    Raises:
      ValueError: If a terminal token is missing from
        `self.converter.terminals_to_ids`.
    """
    tokens = induced_rule.target
    input_symbols = []
    seen_terminal_ids = set()
    qcfg_idxs = []
    rhs = []
    num_nts = 0
    for token in tokens:
        if qcfg_rule.is_nt(token):
            qcfg_idxs.append(qcfg_rule.get_nt_index(token))
            # NT placeholders are 1-indexed, so bump the count first.
            num_nts += 1
            qcfg_nt = NT_PLACEHOLDER % num_nts
            rhs.append(JOINT_NT)
            input_symbols.append(
                cfg_rule.CFGSymbol(
                    self.converter.nonterminals_to_ids[qcfg_nt],
                    cfg_rule.NON_TERMINAL))
            continue

        if token not in self.converter.terminals_to_ids:
            raise ValueError(
                "token `%s` not in `converter.terminals_to_ids`: %s" %
                (token, self.converter.terminals_to_ids))
        rhs.append(token)
        terminal_id = self.converter.terminals_to_ids[token]
        seen_terminal_ids.add(terminal_id)
        input_symbols.append(
            cfg_rule.CFGSymbol(terminal_id, cfg_rule.TERMINAL))

    # Only keep rules whose terminals all occur in the input; any other rule
    # cannot take part in a parse of these tokens.
    def uses_only_seen_terminals(parser_rule):
        return all(
            symbol.type != cfg_rule.TERMINAL
            or symbol.idx in seen_terminal_ids
            for symbol in parser_rule.rhs)

    filtered_rules = [
        candidate for candidate in self.parser_rules
        if uses_only_seen_terminals(candidate)
    ]
    if verbose:
        print("filtered_rules:")
        for kept_rule in filtered_rules:
            print(kept_rule)

    # Keep the full derivation so nonterminals can be read off afterwards.
    def populate_fn(unused_span_begin, unused_span_end, parser_rule,
                    children):
        return [ParseNode(parser_rule, children)]

    nonterminals = set(self.converter.nonterminals_to_ids.values())
    parses = cfg_parser.parse_symbols(
        input_symbols,
        filtered_rules,
        nonterminals,
        nonterminals,
        populate_fn,
        postprocess_fn=None,
        max_single_nt_applications=self.max_single_nt_applications,
        verbose=verbose)
    if not parses:
        print("Could not parse: %s" % (tokens, ))
        return None

    # Extract cfg_nts from parses, dropping empty results.
    cfg_nts_set = set()
    for parse_node in parses:
        ordered_nts = _rearrange_nts(
            _get_cfg_nts(self.converter.nonterminals_to_ids,
                         self.rhs_nt_rules, parse_node, num_nts),
            qcfg_idxs)
        if ordered_nts:
            cfg_nts_set.add(ordered_nts)

    return JointRule(induced_rule, frozenset(cfg_nts_set))