def parse_element_text(self, ctx: Element_textContext) -> Antlr4Symbol:
    """
    Convert an ``element_text`` parse-tree node into an Antlr4Symbol.

    Grammar: element_text: NAME | TERMINAL;

    :return: a non-terminal symbol for NAME, a terminal symbol
        (with the ``<b>``/``</b>`` markup stripped) for TERMINAL
    """
    name_node = ctx.NAME()
    if name_node is None:
        # TERMINAL text is wrapped in <b>...</b>; strip the markup tags
        text = ctx.TERMINAL().getText()
        text = text[len("<b>"):-len("</b>")]
        return Antlr4Symbol(text, True)
    return Antlr4Symbol(name_node.getText(), False)
def rm_ambiguity(rules):
    """
    In ``variable_decl_assignment`` replace the optional
    ``( ASSIGN class_new )?`` group with its mandatory body to remove
    an ambiguity.
    """
    rule = rule_by_name(rules, "variable_decl_assignment")
    pattern = Antlr4Option(
        Antlr4Sequence([
            Antlr4Symbol("ASSIGN", False),
            Antlr4Symbol("class_new", False),
        ]))

    def unwrap_option(o):
        # drop the Antlr4Option wrapper, keep its body
        if o == pattern:
            return o.body

    replace_item_by_sequence(rule, unwrap_option)
def extract_bin_ops(rules, current_expr_rule, ops_to_extrat, new_rule_name,
                    top_rule_name, handle_conditional_fn, handle_inside_fn):
    """
    Build a new rule named ``new_rule_name`` containing the binary-operator
    expressions for the operators in ``ops_to_extrat`` and insert it in
    front of ``current_expr_rule``.

    KW_INSIDE / QUESTIONMARK are delegated to the supplied handlers;
    KW_DIST is intentionally skipped (it is allowed only in specific
    places and handled elsewhere).

    :return: the newly created Antlr4Rule
    """
    SPECIAL = ("KW_INSIDE", "KW_DIST", "QUESTIONMARK")
    plain_ops = [o for o in ops_to_extrat if o not in SPECIAL]

    bin_op_choices = []
    if plain_ops:
        if len(plain_ops) == 1:
            op = Antlr4Symbol(plain_ops[0], False)
        else:
            op = Antlr4Selection([Antlr4Symbol(o, False) for o in plain_ops])
        # expression (binary_operator ( attribute_instance )* expression)*
        op_tail = Antlr4Sequence([
            op,
            Antlr4Iteration(Antlr4Symbol("attribute_instance", False)),
            Antlr4Symbol(top_rule_name, False),
        ])
        bin_op_choices.append(Antlr4Sequence([
            Antlr4Symbol(current_expr_rule.name, False),
            Antlr4Iteration(op_tail),
        ]))

    if "KW_INSIDE" in ops_to_extrat:
        handle_inside_fn(bin_op_choices, current_expr_rule)
    # KW_DIST: handled differently, only allowed on specified places
    if "QUESTIONMARK" in ops_to_extrat:
        handle_conditional_fn(bin_op_choices, current_expr_rule)

    # create a new rule which contains the extracted binary operators
    if len(bin_op_choices) > 1:
        new_body = Antlr4Selection(bin_op_choices)
    else:
        new_body = bin_op_choices[0]
    new_r = Antlr4Rule(new_rule_name, new_body)
    rules.insert(rules.index(current_expr_rule), new_r)
    return new_r
def subroutine_call_rm_lr(rules):
    """
    Remove the left recursion from the third alternative of
    ``subroutine_call`` by rewriting its leading selection to
    non-recursive primaries.
    """
    r = rule_by_name(rules, "subroutine_call")
    assert isinstance(r.body, Antlr4Selection)
    choice = r.body[2]
    items = list(iter_non_visuals(choice))
    # sanity check: the alternative we rewrite ends with method_call_body
    assert items[-1].symbol == "method_call_body", items[-1].symbol
    head: Antlr4Selection = items[0]
    head.clear()
    for sym_name in ("primary_no_cast_no_call", "cast",
                     "implicit_class_handle"):
        head.append(Antlr4Symbol(sym_name, False))
def add_interface_class_declaration(rules):
    """
    ``interface_class_declaration`` is not referenced anywhere in the
    specified rules; allow it as an alternative wherever
    ``class_declaration`` appears.
    """
    cls_symbol = Antlr4Symbol("class_declaration", False)

    def widen(obj):
        if obj == cls_symbol:
            # offer the interface variant next to the class variant
            intf = Antlr4Symbol("interface_class_declaration", False)
            return Antlr4Selection([obj, intf])

    for r in rules:
        replace_item_by_sequence(r, widen)
def add_comments_and_ws(rules):
    """
    Append lexer rules that route comments to the HIDDEN channel and
    skip whitespace.
    """
    # ONE_LINE_COMMENT: '//' .*? '\r'? '\n' -> channel(HIDDEN);
    one_line_body = Antlr4Sequence([
        Antlr4Symbol("//", True),
        Antlr4Symbol(".*?", True, is_regex=True),
        Antlr4Option(Antlr4Symbol("\r", True)),
        Antlr4Symbol("\n", True),
    ])
    rules.append(Antlr4Rule(
        "ONE_LINE_COMMENT", one_line_body,
        lexer_actions=[Antlr4LexerAction.channel("HIDDEN")]))

    # BLOCK_COMMENT: '/*' .*? '*/' -> channel(HIDDEN);
    block_body = Antlr4Sequence([
        Antlr4Symbol("/*", True),
        Antlr4Symbol(".*?", True, is_regex=True),
        Antlr4Symbol("*/", True),
    ])
    rules.append(Antlr4Rule(
        "BLOCK_COMMENT", block_body,
        lexer_actions=[Antlr4LexerAction.channel("HIDDEN")]))

    # WHITE_SPACE: [ \t\n\r]+ -> skip;
    ws_body = Antlr4Sequence([
        Antlr4Symbol("[ \\t\\n\\r] +", True, is_regex=True),
    ])
    rules.append(Antlr4Rule(
        "WHITE_SPACE", ws_body,
        lexer_actions=[Antlr4LexerAction.skip()]))
def handle_inside_fn(bin_op_choices, current_expr_rule):
    """
    Append the KW_INSIDE choice:
    expression (KW_INSIDE LBRACE open_range_list RBRACE)*
    """
    # formatting markers for the previously added choice
    bin_op_choices[-1].extend([Antlr4Newline(), Antlr4Indent(1)])
    inside_tail = Antlr4Sequence([
        Antlr4Symbol("KW_INSIDE", False),
        Antlr4Symbol("LBRACE", False),
        Antlr4Symbol("open_range_list", False),
        Antlr4Symbol("RBRACE", False),
    ])
    bin_op_choices.append(Antlr4Sequence([
        Antlr4Symbol(current_expr_rule.name, False),
        Antlr4Iteration(inside_tail),
    ]))
def match_replace_fn(o):
    """
    If a selection contains two or more single-character terminal
    alternatives, collapse them into a single regex character class.

    Returns a replacement sequence when the selection becomes empty,
    otherwise mutates the selection in place.
    """
    if not isinstance(o, Antlr4Selection):
        return

    single_chars = []
    for choice in o:
        items = list(iter_non_visuals(choice))
        if len(items) > 1:
            continue
        sym = items[0]
        if isinstance(sym, Antlr4Symbol) and sym.is_terminal \
                and len(sym.symbol) == 1:
            single_chars.append((choice, sym))

    if len(single_chars) <= 1:
        return

    # build a regex character class out of them and replace them by it
    for choice, _ in single_chars:
        o.remove(choice)
    char_class = "[%s]" % "".join(sym._escaped() for _, sym in single_chars)
    regex_symbol = Antlr4Symbol(char_class, True, is_regex=True)
    if len(list(iter_non_visuals(o))):
        o.append(regex_symbol)
    else:
        return Antlr4Sequence([
            regex_symbol,
        ])
def fix_priority_of__class_scope__package_scope(rules):
    """
    Swap alternative order so ``package_scope`` is tried before
    ``class_scope`` wherever the pair appears as a selection.
    """
    pair = ["class_scope", "package_scope"]
    orig = Antlr4Selection([Antlr4Symbol(n, False) for n in pair])
    repl = Antlr4Selection([Antlr4Symbol(n, False) for n in reversed(pair)])

    def swap(o):
        if o == orig:
            return deepcopy(repl)

    for rule in rules:
        replace_item_by_sequence(rule, swap)
def extract_keywords_to_specific_rule(p: SvRule2Antlr4Rule):
    """
    Collect every terminal used inside parser (non-lexer) rules, rewrite
    each occurrence into a reference to a ``KW_*`` token and append one
    lexer rule per collected keyword.
    """
    keywords = set()

    def get_kw_name(k):
        # "$" is not a legal character in a token name
        return "KW_" + k.replace("$", "DOLAR_").upper()

    def collect_keywords(obj):
        if isinstance(obj, Antlr4Symbol) and obj.is_terminal:
            keywords.add(obj.symbol)

    def renamer(obj: iAntlr4GramElem):
        if isinstance(obj, Antlr4Symbol) and obj.is_terminal \
                and obj.symbol in keywords:
            obj.is_terminal = False
            obj.symbol = get_kw_name(obj.symbol)

    parser_rules = [r for r in p.rules if not r.is_lexer_rule()]
    for r in parser_rules:
        r.walk(collect_keywords)
    for r in parser_rules:
        r.walk(renamer)

    for k in sorted(keywords):
        p.rules.append(Antlr4Rule(get_kw_name(k), Antlr4Symbol(k, True)))
def split_rule(rules, rule_name, symbols_to_extract: List[str],
               subrule_name: str):
    """
    Keep in ``rule_name`` only the options which can start with a symbol
    from ``symbols_to_extract``; move all remaining options into a new
    rule ``subrule_name`` and reference it as the first alternative.

    :return: the newly created sub rule
    """
    r = rule_by_name(rules, rule_name)
    assert isinstance(r.body, Antlr4Selection), r

    moved = Antlr4Selection([])
    for option in r.body:
        firsts = set()
        _direct_left_corner(option, firsts, allow_eps_in_sel=True)
        if not firsts.intersection(symbols_to_extract):
            moved.append(option)

    kept = [o for o in r.body if o not in moved]
    r.body = Antlr4Selection(kept)
    r.body.insert(0, Antlr4Symbol(subrule_name, False))
    if len(r.body) == 1:
        # single alternative: drop the selection wrapper
        r.body = r.body[0]

    assert len(moved) > 0
    sub_body = moved[0] if len(moved) == 1 else Antlr4Selection(moved)
    sub_r = Antlr4Rule(subrule_name, sub_body)
    rules.insert(rules.index(r), sub_r)
    return sub_r
def rm_semi_from_cross_body_item(rules):
    """
    Remove the redundant SEMI from ``cross_body`` (SEMI is already part
    of ``cross_body_item``).
    """
    rule = rule_by_name(rules, "cross_body")
    semi = Antlr4Symbol("SEMI", False)

    def drop_semi(o):
        # an empty sequence effectively erases the matched symbol
        if o == semi:
            return Antlr4Sequence([])

    replace_item_by_sequence(rule.body[0], drop_semi)
def direct_left_recurse_rm(rules, rule_name):
    """
    Remove direct left recursion from rule ``rule_name``.

    The rule keeps its non-recursive choices under the new name
    ``<rule_name>_item`` and a fresh rule named ``rule_name`` is inserted
    which implements the previously left-recursive choices in terms of
    ``<rule_name>_item``.

    :raises NotImplementedError: for rule shapes this transform does not
        support (non sequence/selection body, or all choices recursive)
    """
    r = rule_by_name(rules, rule_name)
    if isinstance(r.body, Antlr4Selection):
        choices = r.body
    elif isinstance(r.body, Antlr4Sequence):
        choices = [r.body, ]
    else:
        raise NotImplementedError()

    # find choices which start with this rule's own non-terminal
    lr_choices = []
    for c in choices:
        if isinstance(c, Antlr4Sequence):
            first = next(iter_non_visuals(c))
            if isinstance(first, Antlr4Symbol) and first.symbol == rule_name:
                lr_choices.append(c)
        else:
            raise NotImplementedError()

    # remove choices which are causing left recursion
    assert len(lr_choices) >= 1, rule_name
    for lr_choice in lr_choices:
        choices.remove(lr_choice)

    if len(choices) == 0:
        raise NotImplementedError()
    elif len(choices) == 1:
        r.body = choices[0]

    # rename this rule to <rule_name>_item
    r_base_name = r.name + "_item"
    for _r in rules:
        # BUG FIX: the original asserted on r.name (loop-invariant and
        # always != r_base_name) so the duplicate-name guard never fired;
        # check the iterated rule instead
        assert _r.name != r_base_name, r_base_name
    r.name = r_base_name

    # create a new rule which implements the removed choices and also
    # expands to <rule_name>_item
    choices_new = Antlr4Selection([])
    for lr_choice in lr_choices:
        first = next(iter_non_visuals(lr_choice))
        assert isinstance(first, Antlr4Symbol) and first.symbol == rule_name
        repl = Antlr4Symbol(r_base_name, False)
        _iterate_everything_except_first_and_replace_first(lr_choice, repl)
        if not choices_new:
            # formatting markers only on the first new choice
            lr_choice.insert(0, Antlr4Newline())
            lr_choice.insert(1, Antlr4Indent(1))
        choices_new.append(lr_choice)

    body_new = choices_new[0] if len(choices_new) == 1 else choices_new
    r_new = Antlr4Rule(rule_name, body_new)
    rules.insert(rules.index(r), r_new)
def add_file_path_literal_rules(p):
    """
    Append the FILE_PATH_SPEC_CHAR fragment and the FILE_PATH_SPEC
    lexer rule used for file-path literals.
    """
    spec_char = Antlr4Rule(
        "FILE_PATH_SPEC_CHAR",
        Antlr4Symbol(
            "[^ !$`&()+] | ( '\\\\' [ !$`&*()+] )", True, True),
        is_fragment=True)
    p.rules.append(spec_char)

    # FILE_PATH_SPEC: (FILE_PATH_SPEC_CHAR (SEMI FILE_PATH_SPEC_CHAR)?)+
    item = Antlr4Sequence([
        Antlr4Symbol("FILE_PATH_SPEC_CHAR", False),
        Antlr4Option(Antlr4Sequence([
            Antlr4Symbol('SEMI', False),
            Antlr4Symbol("FILE_PATH_SPEC_CHAR", False),
        ])),
    ])
    p.rules.append(Antlr4Rule(
        "FILE_PATH_SPEC",
        Antlr4Iteration(item, positive=True)))
def handle_conditional_fn(bin_op_choices, current_expr_rule):
    """
    Add the conditional (ternary) choices and remove the left recursion
    from cond_predicate/conditional_expression.

    NOTE(review): relies on ``rules`` from the enclosing scope — this is
    a closure, not a standalone function.
    """
    # rm left recursion from cond_predicate/conditional_expression
    replace_symbol_in_rule(
        rules, "conditional_expression",
        "cond_predicate", "cond_expr_predicate", only_first=True)
    iterate_everything_except_first(rules, "conditional_expression")

    # create a new cond_predicate clone (cond_expr_predicate)
    # without the left recursion
    cond_predicate = rule_by_name(rules, "cond_predicate")
    cond_expr_predicate = deepcopy(cond_predicate)
    cond_expr_predicate.name = "cond_expr_predicate"
    rules.insert(rules.index(cond_predicate), cond_expr_predicate)
    replace_symbol_in_rule(
        rules, "cond_expr_predicate",
        "expression", current_expr_rule.name, only_first=True)

    bin_op_choices.extend([
        Antlr4Symbol(current_expr_rule.name, False),
        Antlr4Symbol("conditional_expression", False),
    ])
def handle_conditional_fn(bin_op_choices, current_expr_rule):
    """
    Add the ternary-operator choice:
    expr (QUESTIONMARK attribute_instance* constant_expression
          COLON constant_expression)*
    """
    ternary_tail = Antlr4Sequence([
        Antlr4Symbol("QUESTIONMARK", False),
        Antlr4Iteration(Antlr4Symbol("attribute_instance", False)),
        Antlr4Symbol("constant_expression", False),
        Antlr4Symbol("COLON", False),
        Antlr4Symbol("constant_expression", False),
    ])
    bin_op_choices.extend([
        Antlr4Symbol(current_expr_rule.name, False),
        Antlr4Iteration(ternary_tail),
    ])
def extract_option_as_rule(rules, rule_name, options_i, new_rule_name):
    """
    Move the alternatives at indexes ``options_i`` of rule ``rule_name``
    into a new rule ``new_rule_name`` and reference the new rule at the
    position of the first extracted alternative.

    ``options_i`` is expected in ascending order.

    :return: the newly created rule
    """
    r = rule_by_name(rules, rule_name)
    assert isinstance(r.body, Antlr4Selection)

    # grab the options before we start rewriting the body
    extracted = Antlr4Selection([r.body[i] for i in options_i])

    # replace the first extracted option with a reference to the new rule
    r.body[options_i[0]] = Antlr4Sequence([
        Antlr4Symbol(new_rule_name, False),
        Antlr4Newline(),
        Antlr4Indent(1),
    ])
    # drop the remaining extracted options
    r.body = Antlr4Selection(
        [x for i, x in enumerate(r.body) if i not in options_i[1:]])

    new_body = extracted[0] if len(extracted) == 1 else extracted
    new_r = Antlr4Rule(new_rule_name, new_body)
    rules.insert(rules.index(r), new_r)
    return new_r
def remove_useless_and_normalize_names(p):
    """
    Normalize rule names coming from the SystemVerilog spec grammar and
    remove rules which are useless for parsing (overspecified duplicates,
    identifier aliases, rules the parser cannot distinguish anyway).

    :param p: converter object with a ``rules`` list (SvRule2Antlr4Rule)
    """
    renames = {}
    # special symbols (punctuation etc.) get their canonical token names
    for k, v in SvRule2Antlr4Rule.SPEC_SYMB.items():
        renames[k] = v
    # rm_newline_from_simple_rules(p.rules)
    # nts = get_used_non_terminals(p.rules)
    # def_nts = get_defined_non_terminals(p.rules)

    # overspecified
    # finish_number 0 - 2
    replace_rule("finish_number", "UNSIGNED_NUMBER", renames, p)
    # scalar_constant 1b number
    replace_rule("scalar_constant", "integral_number", renames, p)
    # init_val 1b value
    replace_rule("init_val", "integral_number", renames, p)
    # edge_descriptor 2 tristate digits
    # edge_descriptor: '01' | '10' | Z_OR_X ZERO_OR_ONE | ZERO_OR_ONE Z_OR_X;
    # dpi_spec_string two concrete strings
    replace_rule("dpi_spec_string", "STRING_LITERAL", renames, p)

    # #0 -> # UNSIGNED_NUMBER
    primitive_delay = Antlr4Rule(
        "primitive_delay",
        Antlr4Sequence([
            Antlr4Symbol("HASH", False),
            Antlr4Symbol("UNSIGNED_NUMBER", False),
        ]))
    p.rules.append(primitive_delay)
    replace_rule("#0", "primitive_delay", renames, p)

    # all same
    ps_identifier_rules = [
        "ps_class_identifier",
        "ps_covergroup_identifier",
        "ps_checker_identifier",
    ]
    for name in ps_identifier_rules:
        replace_rule(name, "ps_identifier", renames, p)

    ps_or_hierarchical_id_rules = [
        "ps_or_hierarchical_net_identifier",
        "ps_or_hierarchical_property_identifier",
        "ps_or_hierarchical_sequence_identifier",
        "ps_or_hierarchical_tf_identifier",
    ]
    # common replacement rule for all ps_or_hierarchical_* identifiers
    ps_or_hierarchical_identifier = Antlr4Rule(
        "ps_or_hierarchical_identifier",
        Antlr4Selection([
            Antlr4Sequence([
                Antlr4Option(Antlr4Symbol("package_scope", False)),
                Antlr4Symbol("identifier", False)
            ]),
            Antlr4Symbol("hierarchical_identifier", False),
        ]))
    p.rules.append(ps_or_hierarchical_identifier)
    for name in ps_or_hierarchical_id_rules:
        replace_rule(name, "ps_or_hierarchical_identifier", renames, p)

    # rules which should become lexer tokens (renamed to UPPERCASE below);
    # NOTE(review): some entries appear twice — harmless, renames is a dict
    to_lexer = [
        "c_identifier",
        "unsigned_number",
        "simple_identifier",
        "system_tf_identifier",
        "unsigned_number",
        "string_literal",
        "binary_number",
        "octal_number",
        "hex_number",
        "octal_number",
        "hex_number",
        "fixed_point_number",
        "escaped_identifier",
        "unbased_unsized_literal",
        "time_literal",
        # because it is very hard to switch mode to parse
        # edge_descriptor and it is easy to just parse coma separated list of 2 chars
        "edge_control_specifier",
        "level_symbol",
        "output_symbol",
        "edge_symbol",
        "file_path_spec",
    ]
    for tl in to_lexer:
        renames[tl] = tl.upper()

    # lexer rules which should become fragments
    fragments = {
        "binary_value", "octal_value", "hex_value",
        "decimal_base", "binary_base", "octal_base", "hex_base",
        "non_zero_unsigned_number", "size", "sign",
        "edge_descriptor",
        "non_zero_decimal_digit", "decimal_digit", "binary_digit",
        "octal_digit", "hex_digit", "x_digit", "z_digit", "exp",
        'white_space', 'zero_or_one', 'z_or_x', 'Any_ASCII_Characters',
        "any_printable_ASCII_character_except_white_space",
        "time_unit"
    }
    for r in p.rules:
        if r.name.startswith("$"):
            # "$" is not a legal character in an ANTLR rule name
            renames[r.name] = r.name.replace("$", "dolar_")
        # NOTE(review): nesting reconstructed from collapsed source — the
        # inner loop redundantly re-checks membership and re-adds the
        # fragment renames for every rule; functionally idempotent
        for fr in fragments:
            if r.name in fragments:
                r.is_fragment = True
            renames[fr] = fr.upper()

    for r in p.rules:
        rm_redunt_whitespaces_on_end(r)

    # rules which are plain aliases of identifier / hierarchical_identifier
    identifier_rule_equivalents = {
        r.name for r in collect_simple_rules(p.rules, "identifier")
    }
    hierarchical_identifier_rule_equivalents = {
        r.name for r in collect_simple_rules(p.rules, "hierarchical_identifier")
    }

    to_remove = {
        "comment",
        "one_line_comment",
        "block_comment",
        "comment_text",
        "white_space",
    }
    to_remove.update(identifier_rule_equivalents)
    to_remove.update(hierarchical_identifier_rule_equivalents)

    simple_rules_to_remove = [
        "default_clause",  # default kw
        "variable_port_type",
        "limit_value",  # used only in more specific limit values
        "dpi_function_proto",  # used only in dpi block so we already know
        "dpi_task_proto",  # used only in dpi block so we already know
        "property_lvar_port_direction",  # used only in property so we already know
        # "consecutive_repetition",  # useless
        "trans_item",
        "ordered_parameter_assignment",
        "function_statement",
        "case_expression",
        "case_item_expression",
        "open_value_range",  # used only in open_range_list so we already know
        "constant_assignment_pattern_expression",  # parser do not see the difference between const/non const
        "clockvar",  # used only in clockvar_expression
        "path_delay_expression",  # used only in more specific rules
        "constant_function_call",  # parser do not see the difference between const/non const
        "function_subroutine_call",
        "constant_let_expression",  # parser do not see the difference between const/non const
        "attr_name",  # used only in attr_spec
        "array_identifier",  # never used
        "checker_identifier",  # used only in rule with same name
        "class_identifier",
        "class_variable_identifier",
        "clocking_identifier",
        "config_identifier",
        "const_identifier",
        "constraint_identifier",
        "covergroup_identifier",
        "covergroup_variable_identifier",
        "cover_point_identifier",
        "cross_identifier",
        "enum_identifier",
        "formal_identifier",
        "function_identifier",
        "generate_block_identifier",
        "genvar_identifier",
        "hierarchical_array_identifier",
        "hierarchical_block_identifier",
        "hierarchical_event_identifier",
        "hierarchical_net_identifier",
        "hierarchical_parameter_identifier",
        "hierarchical_property_identifier",
        "hierarchical_sequence_identifier",
        "hierarchical_task_identifier",
        "hierarchical_tf_identifier",
        "hierarchical_variable_identifier",
        "index_variable_identifier",
        "interface_identifier",
        "interface_instance_identifier",
        # "inout_port_identifier",
        # "input_port_identifier",
        "instance_identifier",
        "member_identifier",
        "method_identifier",
        "modport_identifier",
        "module_identifier",
        "net_identifier",
        # "output_port_identifier"
        "package_identifier",
        "parameter_identifier",
        "port_identifier",
        "production_identifier",
        "program_identifier",
        "property_identifier",
        "sequence_identifier",
        "signal_identifier",
        "specparam_identifier",
        "task_identifier",
        "tf_identifier",
        "terminal_identifier",
        "topmodule_identifier",
        "udp_identifier",
        "variable_identifier",
    ]
    for sr in simple_rules_to_remove:
        remove_simple_rule(sr, p)
    p.rules = [r for r in p.rules if r.name not in to_remove]

    # redirect all identifier aliases to the two canonical rules
    for idname in identifier_rule_equivalents:
        renames[idname] = "identifier"
    for idname in hierarchical_identifier_rule_equivalents:
        renames[idname] = "hierarchical_identifier"

    # apply all accumulated renames and mark regex symbols
    apply_rename = generate_renamer(renames, True)
    for r in p.rules:
        r.walk(apply_rename)
        r.walk(mark_regex)

    # add lexer rules for the special symbols
    for k, v in SvRule2Antlr4Rule.SPEC_SYMB.items():
        body = Antlr4Symbol(k, True)
        r = Antlr4Rule(v, body)
        p.rules.append(r)

    # because C_IDENTIFIER is just normal identifier without $ and can match identifiers
    for r in p.rules:
        if r.name == "identifier":
            r.body.insert(0, Antlr4Symbol("C_IDENTIFIER", False))
# --- VHDL grammar conversion setup (script-level statements) ---
# NOTE(review): reconstructed from collapsed source; presumably part of the
# VHDL->ANTLR4 conversion entry point — confirm nesting against original file.

# character-class rules from the VHDL spec become UPPERCASE lexer tokens
renames = {
    k: k.upper()
    for k in [
        "base_specifier", "lower_case_letter", "upper_case_letter",
        'special_character', 'other_special_character', 'digit',
        'format_effector', 'space_character', 'underline'
    ]
}
# avoid collisions with common/reserved names
renames["mode"] = "signal_mode"
renames["E"] = "E_SIGN"
renames["NULL"] = "NULL_SYM"
# special symbols (punctuation) get their canonical token names
for k, v in VhdlRule2Antlr4Rule.SPEC_SYMB.items():
    renames[k] = v

# PSL-related non-terminals which are intentionally not translated
IGNORED = [
    Antlr4Symbol(s, False) for s in [
        "Property_Declaration",
        "Sequence_Declaration",
        "Clock_Declaration",
        "PSL_Directive",
        "Verification_Unit",
    ]
]

# parse the prototype grammar file into rule objects
with open("vhdl2008.g4_proto") as f:
    p = VhdlRule2Antlr4Rule()
    p.convert(f)
rm_newline_from_simple_rules(p.rules)
# nts/def_nts: used vs. defined non-terminals (diagnostics/bookkeeping)
nts = get_used_non_terminals(p.rules)
def_nts = get_defined_non_terminals(p.rules)
keywords = set()
def add_string_literal_rules(p):
    """
    Append the lexer fragments used by string literals:
    ANY_ASCII_CHARACTERS and
    ANY_PRINTABLE_ASCII_CHARACTER_EXCEPT_WHITE_SPACE.
    """
    # alternatives: any non-quote/backslash/newline char, escaped
    # line-continuations, common escapes and octal/hex escape sequences
    char_alternatives = [
        Antlr4Symbol('~["\\\\\\r\\n]', True, True),
        Antlr4Symbol('\\\n', True),
        Antlr4Symbol('\\\r\n', True),
        Antlr4Symbol('\t', True),
        Antlr4Symbol('\\\\', True),
        Antlr4Symbol('\v', True),
        Antlr4Symbol('\f', True),
        Antlr4Symbol('\a', True),
        Antlr4Symbol("'\\\\' [0-9] [0-9]? [0-9]?", True, True),
        Antlr4Symbol("'\\\\' 'x' [0-9A-Fa-f] [0-9A-Fa-f]?", True, True),
    ]
    p.rules.append(Antlr4Rule(
        "ANY_ASCII_CHARACTERS",
        Antlr4Selection(char_alternatives),
        is_fragment=True))

    p.rules.append(Antlr4Rule(
        "ANY_PRINTABLE_ASCII_CHARACTER_EXCEPT_WHITE_SPACE",
        Antlr4Symbol("'\\u0021'..'\\u007E'", True, True),
        is_fragment=True))