def add_comments_and_ws(rules):
    """Append the lexer rules for comments and whitespace to *rules*."""
    # ONE_LINE_COMMENT: '//' .*? '\r'? '\n' -> channel(HIDDEN);
    rules.append(Antlr4Rule(
        "ONE_LINE_COMMENT",
        Antlr4Sequence([
            Antlr4Symbol("//", True),
            Antlr4Symbol(".*?", True, is_regex=True),
            Antlr4Option(Antlr4Symbol("\r", True)),
            Antlr4Symbol("\n", True),
        ]),
        lexer_actions=[Antlr4LexerAction.channel("HIDDEN")]))
    # BLOCK_COMMENT: '/*' .*? '*/' -> channel(HIDDEN);
    rules.append(Antlr4Rule(
        "BLOCK_COMMENT",
        Antlr4Sequence([
            Antlr4Symbol("/*", True),
            Antlr4Symbol(".*?", True, is_regex=True),
            Antlr4Symbol("*/", True),
        ]),
        lexer_actions=[Antlr4LexerAction.channel("HIDDEN")]))
    # WHITE_SPACE: [ \t\n\r] + -> skip;
    rules.append(Antlr4Rule(
        "WHITE_SPACE",
        Antlr4Sequence([
            Antlr4Symbol("[ \\t\\n\\r] +", True, is_regex=True),
        ]),
        lexer_actions=[Antlr4LexerAction.skip()]))
def is_prefix_of_elem(prefix: iAntlr4GramElem, elem: iAntlr4GramElem):
    """
    Check if *prefix* is a prefix of *elem* (ignoring visual items).

    :returns: tuple (is_prefix, suffix) where suffix is the remainder of
        *elem* after the matched prefix, or None when not a prefix
    """
    if not isinstance(prefix, Antlr4Sequence):
        prefix = Antlr4Sequence([prefix, ])
    if not isinstance(elem, Antlr4Sequence):
        elem = Antlr4Sequence([elem, ])
    pr_list = list(iter_non_visuals(prefix))
    el_list = list(iter_non_visuals(elem))
    if len(pr_list) > len(el_list):
        return (False, None)
    # empty prefix trivially matches; the whole element is the suffix
    # (the original crashed here with elem.index(None))
    if not pr_list:
        return (True, elem)
    last_el = None
    for el, pr in zip(el_list, pr_list):
        if not (el == pr):
            return (False, None)
        last_el = el
    # BUG fix: the original sliced at elem.index(last_pr), an equality search
    # for an item taken from the *prefix*; with a duplicated item at the end
    # of the prefix it found the first occurrence and returned a too-long
    # suffix. Locate the last matched element of *elem* by identity instead.
    idx = next(i for i, e in enumerate(elem) if e is last_el)
    return (True, elem[idx + 1:])
def extract_bin_ops(rules, current_expr_rule, ops_to_extrat, new_rule_name,
                    top_rule_name, handle_conditional_fn, handle_inside_fn):
    """
    Build a new rule *new_rule_name* containing the binary-operator
    alternatives listed in *ops_to_extrat* and insert it just before
    *current_expr_rule* in *rules*.

    :param ops_to_extrat: token names of the operators to extract;
        KW_INSIDE, KW_DIST and QUESTIONMARK receive special treatment
    :param handle_conditional_fn: callback adding the ternary-operator choice
    :param handle_inside_fn: callback adding the ``inside`` operator choice
    :returns: the newly created Antlr4Rule
    """
    # find option with binary op rule
    # expr = rule_by_name(rules, "expression")
    # operators that can be handled by the generic "expr OP expr" shape
    ops_no_special = [
        o for o in ops_to_extrat
        if o not in [
            "KW_INSIDE",
            "KW_DIST",
            "QUESTIONMARK",
        ]
    ]
    bin_op_choices = []
    if len(ops_no_special) > 0:
        if len(ops_no_special) == 1:
            op = Antlr4Symbol(ops_no_special[0], False)
        else:
            op = Antlr4Selection(
                [Antlr4Symbol(o, False) for o in ops_no_special])
        # expression (binary_operator ( attribute_instance )* expression)*
        bin_op_choice = Antlr4Sequence([
            Antlr4Symbol(current_expr_rule.name, False),
            Antlr4Iteration(
                Antlr4Sequence([
                    op,
                    Antlr4Iteration(Antlr4Symbol("attribute_instance", False)),
                    Antlr4Symbol(top_rule_name, False)
                ]))
        ])
        bin_op_choices.append(bin_op_choice)
    if "KW_INSIDE" in ops_to_extrat:
        handle_inside_fn(bin_op_choices, current_expr_rule)
    if "KW_DIST" in ops_to_extrat:
        # handled differently, only allowed on specified places
        pass
    if "QUESTIONMARK" in ops_to_extrat:
        handle_conditional_fn(bin_op_choices, current_expr_rule)
    # create a new rule which contains rule for extracted binary operators
    if len(bin_op_choices) > 1:
        new_body = Antlr4Selection(bin_op_choices)
    else:
        # NOTE: assumes at least one choice was produced — IndexError otherwise
        new_body = bin_op_choices[0]
    new_r = Antlr4Rule(new_rule_name, new_body)
    rules.insert(rules.index(current_expr_rule), new_r)
    return new_r
def handle_inside_fn(bin_op_choices, current_expr_rule):
    """Append the choice for the ``inside`` operator to *bin_op_choices*."""
    # visually separate from the previous choice
    bin_op_choices[-1].extend([Antlr4Newline(), Antlr4Indent(1)])
    # expression (KW_INSIDE LBRACE open_range_list RBRACE)*;
    inside_tail = Antlr4Sequence([
        Antlr4Symbol("KW_INSIDE", False),
        Antlr4Symbol("LBRACE", False),
        Antlr4Symbol("open_range_list", False),
        Antlr4Symbol("RBRACE", False),
    ])
    bin_op_choices.append(Antlr4Sequence([
        Antlr4Symbol(current_expr_rule.name, False),
        Antlr4Iteration(inside_tail),
    ]))
def match_replace_fn(o):
    """
    Collapse single-character terminal alternatives of a selection
    into one regex character class, e.g. ``'a' | 'b' | x`` -> ``[ab] | x``.
    """
    if isinstance(o, Antlr4Selection):
        char_symb_to_replace = []
        for orig_c in o:
            c = orig_c
            c = list(iter_non_visuals(c))
            if len(c) > 1:
                continue
            c = c[0]
            if isinstance(c, Antlr4Symbol) and c.is_terminal and len(
                    c.symbol) == 1:
                char_symb_to_replace.append((orig_c, c))
        if len(char_symb_to_replace) > 1:
            # build a regex char class out of them and replace them by it
            for c, _ in char_symb_to_replace:
                o.remove(c)
            re_str = "[%s]" % ("".join(
                [c._escaped() for _, c in char_symb_to_replace]))
            # renamed from `re` to stop shadowing the stdlib `re` module
            regex_symb = Antlr4Symbol(re_str, True, is_regex=True)
            if len(list(iter_non_visuals(o))):
                # other alternatives remain -> keep the selection, add the class
                o.append(regex_symb)
            else:
                # everything was merged -> replace the whole selection
                return Antlr4Sequence([regex_symb, ])
def match_replace_fn(o):
    """
    In a selection, fold a choice into a preceding choice that is its prefix:
    ``a | a b`` becomes ``a ( b )?``.

    :note: choices have to be directly after each other
    """
    if isinstance(o, Antlr4Selection):
        potential_prefix = None
        potential_prefix_i = None
        to_remove = []
        for i, c in enumerate(o):
            if potential_prefix is None:
                potential_prefix = c
                potential_prefix_i = i
            else:
                # check if the potential_prefix is really a prefix of this rule
                is_prefix, suffix = is_prefix_of_elem(potential_prefix, c)
                if is_prefix:
                    # put suffix as an optional part of the prefix choice
                    if list(iter_non_visuals(suffix)):
                        if not isinstance(potential_prefix, Antlr4Sequence):
                            assert o[potential_prefix_i] is potential_prefix
                            potential_prefix = Antlr4Sequence([
                                potential_prefix,
                            ])
                            o[potential_prefix_i] = potential_prefix
                        if len(suffix) == 1:
                            suffix = suffix[0]
                        else:
                            suffix = Antlr4Sequence(suffix)
                        potential_prefix.append(Antlr4Option(suffix))
                    to_remove.append(c)
                    potential_prefix = None
                    potential_prefix_i = None
                    # (removed vestigial unused local `modified = True`)
                else:
                    potential_prefix = c
                    potential_prefix_i = i
        for c in to_remove:
            o.remove(c)
        if len(o) == 1:
            # a single remaining choice is no longer a selection
            return Antlr4Sequence([o[0], ])
def match_replace_fn(o: iAntlr4GramElem):
    """Strip the option wrapper from a lone occurrence of ``rule_name``."""
    if not isinstance(o, Antlr4Option):
        return
    body_items = list(iter_non_visuals(o.body))
    if len(body_items) != 1:
        return
    sym = body_items[0]
    if isinstance(sym, Antlr4Symbol) and sym.symbol == rule_name:
        return Antlr4Sequence([sym, ])
def rm_ambiguity(rules):
    """Make ``( ASSIGN class_new )?`` mandatory in variable_decl_assignment."""
    rule = rule_by_name(rules, "variable_decl_assignment")
    target = Antlr4Option(Antlr4Sequence([
        Antlr4Symbol("ASSIGN", False),
        Antlr4Symbol("class_new", False),
    ]))

    def match_replace_fn(o):
        # unwrap the optional -> its body stays in place
        if o == target:
            return o.body

    replace_item_by_sequence(rule, match_replace_fn)
def handle_conditional_fn(bin_op_choices, current_expr_rule):
    """Append the choice for the ternary (conditional) operator."""
    # expression (QUESTIONMARK attribute_instance*
    #             constant_expression COLON constant_expression)*
    bin_op_choice = Antlr4Sequence([
        Antlr4Symbol(current_expr_rule.name, False),
        Antlr4Iteration(
            Antlr4Sequence([
                Antlr4Symbol("QUESTIONMARK", False),
                Antlr4Iteration(Antlr4Symbol("attribute_instance", False)),
                Antlr4Symbol("constant_expression", False),
                Antlr4Symbol("COLON", False),
                Antlr4Symbol("constant_expression", False),
            ]))
    ])
    # BUG fix: the original used bin_op_choices.extend(...), which added the
    # expression symbol and the iteration as TWO separate selection choices
    # (a bare `(...)*` choice matches the empty string). Sibling
    # handle_inside_fn appends one sequence choice; do the same here.
    bin_op_choices.append(bin_op_choice)
def _iterate_everything_except_first_and_replace_first(seq, repl):
    """Rewrite *seq* in place to ``repl ( rest )*`` plus newline and indent."""
    tail = list(iter_non_visuals(seq))[1:]
    # a single trailing item needs no sequence wrapper
    body = tail[0] if len(tail) == 1 else Antlr4Sequence(tail)
    seq.clear()
    seq.extend([
        repl,
        Antlr4Iteration(body),
        Antlr4Newline(),
        Antlr4Indent(1),
    ])
def add_file_path_literal_rules(p):
    """Add the lexer rules describing file-path literals to parser *p*."""
    # fragment matching a single path character (optionally backslash-escaped)
    spec_char = Antlr4Rule(
        "FILE_PATH_SPEC_CHAR",
        Antlr4Symbol(
            "[^ !$`&()+] | ( '\\\\' [ !$`&*()+] )", True, True),
        is_fragment=True)
    p.rules.append(spec_char)
    # FILE_PATH_SPEC: one or more path chars, optionally SEMI-separated
    path_spec = Antlr4Rule(
        "FILE_PATH_SPEC",
        Antlr4Iteration(
            Antlr4Sequence([
                Antlr4Symbol("FILE_PATH_SPEC_CHAR", False),
                Antlr4Option(Antlr4Sequence([
                    Antlr4Symbol('SEMI', False),
                    Antlr4Symbol("FILE_PATH_SPEC_CHAR", False),
                ])),
            ]),
            positive=True))
    p.rules.append(path_spec)
def parse_element_sequence(self, ctx: Element_sequenceContext) -> Antlr4Sequence:
    """
    element_sequence: element_block (WS element_block)*;
    """
    items = []
    for child in ctx.children:
        if isinstance(child, Element_blockContext):
            items.append(self.parse_element_block(child))
        else:
            # whitespace parses to multiple visual items
            items.extend(self.parse_ws(child))
    return Antlr4Sequence(items)
def match_replace_fn(o):
    # Flatten an option whose body consists purely of options:
    # ``( a? b? )?`` -> ``a? b?`` and ``( x? )?`` -> ``x?``
    if isinstance(o, Antlr4Option):
        if isinstance(o.body, Antlr4Sequence):
            items = []
            for c in o.body:
                if isinstance(c, Antlr4Sequence):
                    # unwrap a sequence-of-one; longer sequences block the fold
                    c = list(iter_non_visuals(c))
                    if len(c) != 1:
                        return
                    c = c[0]
                if not isinstance(c, Antlr4Option):
                    # a mandatory item -> the outer option is meaningful, keep it
                    return
                items.append(c)
            # every item was itself optional -> outer wrapper is redundant
            return Antlr4Sequence([
                *items,
            ])
        elif isinstance(o.body, Antlr4Option):
            # directly nested option: (x?)? -> x?
            return o.body
def match_replace_fn(o):
    """
    Hoist the option wrapper shared by all choices of a selection:
    ``a? | b?`` becomes ``( a | b )?``.
    """
    if isinstance(o, Antlr4Selection):
        non_optional_items = []
        for c in o:
            if isinstance(c, Antlr4Sequence):
                # unwrap a sequence-of-one; longer sequences block the fold
                c = list(iter_non_visuals(c))
                if len(c) != 1:
                    return
                c = c[0]
            if not isinstance(c, Antlr4Option):
                # a mandatory choice -> nothing shared to hoist
                return
            non_optional_items.append(c.body)
        # reuse the selection object, now holding the unwrapped bodies
        o.clear()
        o.extend(non_optional_items)
        # (removed vestigial unused local `modified = True`)
        return Antlr4Sequence([
            Antlr4Option(o),
        ])
def extract_option_as_rule(rules, rule_name, options_i, new_rule_name):
    """
    Move the selection choices at indexes *options_i* of rule *rule_name*
    into a fresh rule *new_rule_name* and reference it from the original rule.

    :param options_i: indexes of the choices to extract (first one is
        replaced by a reference to the new rule, the rest are dropped)
    :returns: the newly created Antlr4Rule
    """
    r = rule_by_name(rules, rule_name)
    assert isinstance(r.body, Antlr4Selection)
    new_body = Antlr4Selection([])
    for i in options_i:
        new_body.append(r.body[i])
    # replace the first extracted choice with a reference to the new rule
    r.body[options_i[0]] = Antlr4Sequence(
        [Antlr4Symbol(new_rule_name, False), Antlr4Newline(), Antlr4Indent(1)])
    # drop the remaining extracted choices (indexes still valid: only
    # options_i[0] was overwritten in place above, nothing removed yet)
    r.body = Antlr4Selection(
        [x for i, x in enumerate(r.body) if i not in options_i[1:]])
    if len(new_body) == 1:
        new_body = new_body[0]
    new_r = Antlr4Rule(new_rule_name, new_body)
    # insert just before the rule it was extracted from
    rules.insert(rules.index(r), new_r)
    return new_r
for s2 in _s: if (s2 not in IGNORED and s2 != Antlr4Newline() and not isinstance(s2, Antlr4Indent)): all_to_remove = False if _s and all_to_remove: s.pop() continue break if r.name == "signature": # rm ()? as it is in ()? every where it is used a, b = r.body[0].body a = a.body b = b.body # ( ( type_mark ( COMMA type_mark )* )? ( RETURN type_mark )? )? r.body = Antlr4Selection([ Antlr4Sequence([a, Antlr4Newline(), Antlr4Indent(1)]), Antlr4Sequence([a, b, Antlr4Newline(), Antlr4Indent(1)]), Antlr4Sequence([b, Antlr4Newline()]), ]) HEADER = """/* * Grammar extracted from the VHDL 1993, 2002, 2008, 2018 standard and then merged together * (the standard is selected by parser property) */ grammar vhdl; """ with open("vhdl.g4", "w") as f: f.write("\n\n")
def _optimise_selections(elm: iAntlr4GramElem):
    """
    Reduce selection options which differ only in a single item to a
    sequence with a selection of the differing items.

    Example: ``a: b c d | b e d;`` to ``a: b (c | e) d;``

    :returns: truthy if anything was modified
    :note: ignores visuals
    :note: similar sequences have to be directly after each other because
        if they were not the priority of choices would be changed
    """
    if isinstance(elm, Antlr4Sequence):
        modified = False
        for e in elm:
            # BUG fix: the original wrote `modified or _optimise_selections(e)`;
            # `or` short-circuits, so once one element reported a modification
            # every later element was silently skipped.
            modified = _optimise_selections(e) or modified
        return modified
    elif isinstance(elm, Antlr4Selection):
        # List[Tuple[index of different item,
        #            List[Tuple[index in choices, selection options to replace]]]]
        to_reduce = []
        # tuples (index in choices, value) of the current run of similar choices
        similar_choices = []
        diff_in = None
        for c_i, choice in enumerate(elm):
            if not similar_choices:
                # only multi-item sequences can start a mergeable run
                if isinstance(
                        choice,
                        Antlr4Sequence) and len_without_visuals(choice) > 1:
                    similar_choices.append((c_i, choice))
                continue
            else:
                _, prev = similar_choices[0]
                compatible = True
                if (isinstance(prev, Antlr4Sequence)
                        and isinstance(choice, Antlr4Sequence)
                        and len_without_visuals(prev) == len_without_visuals(choice)):
                    # check if it differs from the run in a single item only
                    for i, (prev_item, current_item) in enumerate(
                            zip(iter_non_visuals(prev),
                                iter_non_visuals(choice))):
                        if prev_item != current_item:
                            if diff_in == i or diff_in is None:
                                diff_in = i
                            else:
                                compatible = False
                                break
                    if compatible:
                        similar_choices.append((c_i, choice))
                else:
                    compatible = False
                if not compatible:
                    # close the current run if it is worth reducing
                    if len(similar_choices) > 1:
                        to_reduce.append((diff_in, similar_choices))
                    # reset search
                    if isinstance(choice, Antlr4Sequence
                                  ) and len_without_visuals(choice) > 1:
                        similar_choices = [(c_i, choice)]
                    else:
                        similar_choices = []
                    diff_in = None
        if len(similar_choices) > 1:
            to_reduce.append((diff_in, similar_choices))
        # apply reductions; `offset` tracks the shrinkage of `elm` so the
        # recorded choice indexes stay valid
        offset = 0
        for diff_in, _choices in to_reduce:
            choices = [c[1] for c in _choices]
            start_i = _choices[0][0] + offset
            assert len(_choices) > 1
            # (removed a pointless `try: assert ... except AssertionError: raise`)
            assert elm[start_i] is choices[0]
            diff_item_substitution = Antlr4Selection(
                [index_non_visual(c, diff_in) for c in choices])
            part_to_exclude = index_non_visual(choices[0], diff_in)
            # keep the first choice, swapping the differing item for the
            # selection of all variants
            new_choice = Antlr4Sequence([
                (e if e is not part_to_exclude else diff_item_substitution)
                for e in choices[0]
            ])
            elm[start_i] = new_choice
            del elm[start_i + 1:start_i + len(choices)]
            offset -= len(choices) - 1
        return len(to_reduce)
    return False
def remove_useless_and_normalize_names(p):
    """
    Normalize the rule names of the parsed grammar *p*: replace
    over-specified rules with more general ones, move token-like rules to
    the lexer (UPPERCASE names), mark fragments, and drop rules that carry
    no information for the parser.
    """
    renames = {}
    # special symbols get their predefined lexer names
    for k, v in SvRule2Antlr4Rule.SPEC_SYMB.items():
        renames[k] = v
    # rm_newline_from_simple_rules(p.rules)
    # nts = get_used_non_terminals(p.rules)
    # def_nts = get_defined_non_terminals(p.rules)

    # overspecified
    # finish_number 0 - 2
    replace_rule("finish_number", "UNSIGNED_NUMBER", renames, p)
    # scalar_constant 1b number
    replace_rule("scalar_constant", "integral_number", renames, p)
    # init_val 1b value
    replace_rule("init_val", "integral_number", renames, p)
    # edge_descriptor 2 tristate digits
    # edge_descriptor: '01' | '10' | Z_OR_X ZERO_OR_ONE | ZERO_OR_ONE Z_OR_X;
    # dpi_spec_string two concrete strings
    replace_rule("dpi_spec_string", "STRING_LITERAL", renames, p)
    # #0 -> # UNSIGNED_NUMBER
    primitive_delay = Antlr4Rule(
        "primitive_delay",
        Antlr4Sequence([
            Antlr4Symbol("HASH", False),
            Antlr4Symbol("UNSIGNED_NUMBER", False),
        ]))
    p.rules.append(primitive_delay)
    replace_rule("#0", "primitive_delay", renames, p)
    # all same
    ps_identifier_rules = [
        "ps_class_identifier",
        "ps_covergroup_identifier",
        "ps_checker_identifier",
    ]
    for name in ps_identifier_rules:
        replace_rule(name, "ps_identifier", renames, p)
    ps_or_hierarchical_id_rules = [
        "ps_or_hierarchical_net_identifier",
        "ps_or_hierarchical_property_identifier",
        "ps_or_hierarchical_sequence_identifier",
        "ps_or_hierarchical_tf_identifier",
    ]
    # common replacement for all ps_or_hierarchical_* rules
    ps_or_hierarchical_identifier = Antlr4Rule(
        "ps_or_hierarchical_identifier",
        Antlr4Selection([
            Antlr4Sequence([
                Antlr4Option(Antlr4Symbol("package_scope", False)),
                Antlr4Symbol("identifier", False)
            ]),
            Antlr4Symbol("hierarchical_identifier", False),
        ]))
    p.rules.append(ps_or_hierarchical_identifier)
    for name in ps_or_hierarchical_id_rules:
        replace_rule(name, "ps_or_hierarchical_identifier", renames, p)
    # parser rules that should become lexer tokens (renamed to UPPERCASE)
    # NOTE(review): several names appear twice in this list
    # (unsigned_number, octal_number, hex_number) — harmless but redundant
    to_lexer = [
        "c_identifier",
        "unsigned_number",
        "simple_identifier",
        "system_tf_identifier",
        "unsigned_number",
        "string_literal",
        "binary_number",
        "octal_number",
        "hex_number",
        "octal_number",
        "hex_number",
        "fixed_point_number",
        "escaped_identifier",
        "unbased_unsized_literal",
        "time_literal",
        # because it is very hard to switch mode to parse
        # edge_descriptor and it is easy to just parse coma separated list of 2 chars
        "edge_control_specifier",
        "level_symbol",
        "output_symbol",
        "edge_symbol",
        "file_path_spec",
    ]
    for tl in to_lexer:
        renames[tl] = tl.upper()
    # lexer rules that should only be fragments (not tokens on their own)
    fragments = {
        "binary_value", "octal_value", "hex_value",
        "decimal_base", "binary_base", "octal_base", "hex_base",
        "non_zero_unsigned_number", "size", "sign", "edge_descriptor",
        "non_zero_decimal_digit", "decimal_digit", "binary_digit",
        "octal_digit", "hex_digit", "x_digit", "z_digit", "exp",
        'white_space', 'zero_or_one', 'z_or_x', 'Any_ASCII_Characters',
        "any_printable_ASCII_character_except_white_space",
        "time_unit"
    }
    for r in p.rules:
        if r.name.startswith("$"):
            # '$' is not allowed in ANTLR identifiers
            renames[r.name] = r.name.replace("$", "dolar_")
        # NOTE(review): this inner loop iterates `fr` but tests `r.name`;
        # once any fragment rule is encountered every name in `fragments`
        # gets registered for uppercase rename — looks accidental but the
        # net result matches the intent; confirm against upstream
        for fr in fragments:
            if r.name in fragments:
                r.is_fragment = True
                renames[fr] = fr.upper()
    for r in p.rules:
        rm_redunt_whitespaces_on_end(r)
    # rules that are just aliases of identifier / hierarchical_identifier
    identifier_rule_equivalents = {
        r.name for r in collect_simple_rules(p.rules, "identifier")
    }
    hierarchical_identifier_rule_equivalents = {
        r.name
        for r in collect_simple_rules(p.rules, "hierarchical_identifier")
    }
    # rules handled by dedicated lexer rules instead
    to_remove = {
        "comment",
        "one_line_comment",
        "block_comment",
        "comment_text",
        "white_space",
    }
    to_remove.update(identifier_rule_equivalents)
    to_remove.update(hierarchical_identifier_rule_equivalents)
    simple_rules_to_remove = [
        "default_clause",  # default kw
        "variable_port_type",
        "limit_value",  # used only in more specific limit values
        "dpi_function_proto",  # used only in dpi block so we already know
        "dpi_task_proto",  # used only in dpi block so we already know
        "property_lvar_port_direction",  # used only in property so we already know
        # "consecutive_repetition",  # useless
        "trans_item",
        "ordered_parameter_assignment",
        "function_statement",
        "case_expression",
        "case_item_expression",
        "open_value_range",  # used only in open_range_list so we already know
        "constant_assignment_pattern_expression",  # parser do not see the difference between const/non const
        "clockvar",  # used only in clockvar_expression
        "path_delay_expression",  # used only in more specific rules
        "constant_function_call",  # parser do not see the difference between const/non const
        "function_subroutine_call",
        "constant_let_expression",  # parser do not see the difference between const/non const
        "attr_name",  # used only in attr_spec
        "array_identifier",  # never used
        "checker_identifier",  # used only in rule with same name
        "class_identifier",
        "class_variable_identifier",
        "clocking_identifier",
        "config_identifier",
        "const_identifier",
        "constraint_identifier",
        "covergroup_identifier",
        "covergroup_variable_identifier",
        "cover_point_identifier",
        "cross_identifier",
        "enum_identifier",
        "formal_identifier",
        "function_identifier",
        "generate_block_identifier",
        "genvar_identifier",
        "hierarchical_array_identifier",
        "hierarchical_block_identifier",
        "hierarchical_event_identifier",
        "hierarchical_net_identifier",
        "hierarchical_parameter_identifier",
        "hierarchical_property_identifier",
        "hierarchical_sequence_identifier",
        "hierarchical_task_identifier",
        "hierarchical_tf_identifier",
        "hierarchical_variable_identifier",
        "index_variable_identifier",
        "interface_identifier",
        "interface_instance_identifier",
        # "inout_port_identifier",
        # "input_port_identifier",
        "instance_identifier",
        "member_identifier",
        "method_identifier",
        "modport_identifier",
        "module_identifier",
        "net_identifier",
        # "output_port_identifier"
        "package_identifier",
        "parameter_identifier",
        "port_identifier",
        "production_identifier",
        "program_identifier",
        "property_identifier",
        "sequence_identifier",
        "signal_identifier",
        "specparam_identifier",
        "task_identifier",
        "tf_identifier",
        "terminal_identifier",
        "topmodule_identifier",
        "udp_identifier",
        "variable_identifier",
    ]
    for sr in simple_rules_to_remove:
        remove_simple_rule(sr, p)
    p.rules = [r for r in p.rules if r.name not in to_remove]
    # redirect all aliases to the canonical identifier rules
    for idname in identifier_rule_equivalents:
        renames[idname] = "identifier"
    for idname in hierarchical_identifier_rule_equivalents:
        renames[idname] = "hierarchical_identifier"
    apply_rename = generate_renamer(renames, True)
    for r in p.rules:
        r.walk(apply_rename)
        r.walk(mark_regex)
    # add lexer rules for the special symbols themselves
    for k, v in SvRule2Antlr4Rule.SPEC_SYMB.items():
        body = Antlr4Symbol(k, True)
        r = Antlr4Rule(v, body)
        p.rules.append(r)
    # because C_IDENTIFIER is just normal identifier without $ and can match identifiers
    for r in p.rules:
        if r.name == "identifier":
            r.body.insert(0, Antlr4Symbol("C_IDENTIFIER", False))
def match_replace_fn(o):
    # erase elements matching the captured `semi` by returning an empty sequence
    if o != semi:
        return None
    return Antlr4Sequence([])