def disco_loop(opc, version, queue, real_out, dup_lines=False, asm_format="classic"):
    """Disassemble every code object reachable from ``queue``.

    Code objects are taken from the left of ``queue`` one at a time. Any
    code object found among the constants of the one being processed is
    appended to ``queue``, so nested code is emitted in the order it is
    first encountered. That order is not suited to an assembler, which
    wants definitions before uses, but it avoids recursion.

    :param opc: opcode module handed to ``Bytecode``
    :param version: Python version handed to ``format_code_info``
    :param queue: deque-like collection of code objects (``popleft``/``extend``)
    :param real_out: object with a ``write`` method receiving the listing
    :param dup_lines: forwarded to ``Bytecode``
    :param asm_format: forwarded to ``Bytecode.dis``
    """
    while queue:
        code = queue.popleft()

        # The top-level code object gets no code-info header.
        is_toplevel = code.co_name in ("<module>", "?")
        if not is_toplevel:
            real_out.write("\n" + format_code_info(code, version) + "\n")

        listing = Bytecode(code, opc, dup_lines=dup_lines).dis(asm_format=asm_format)
        real_out.write(listing + "\n")

        # Schedule nested code objects for later disassembly.
        queue.extend(const for const in code.co_consts if iscode(const))
def disco_loop(opc, version, queue, real_out, dup_lines=False, show_bytes=False):
    """Disassemble every code object reachable from ``queue``.

    Code objects are taken from the left of ``queue`` one at a time. Any
    code object found among the constants of the one being processed is
    appended to ``queue``, so nested code is emitted in the order it is
    first encountered. That order is not suited to an assembler, which
    wants definitions before uses, but it avoids recursion.

    :param opc: opcode module handed to ``Bytecode``
    :param version: Python version handed to ``format_code_info``
    :param queue: deque-like collection of code objects (``popleft``/``extend``)
    :param real_out: object with a ``write`` method receiving the listing
    :param dup_lines: forwarded to ``Bytecode``
    :param show_bytes: forwarded to ``Bytecode.dis``
    """
    while queue:
        code = queue.popleft()

        # The top-level code object gets no code-info header.
        is_toplevel = code.co_name in ('<module>', '?')
        if not is_toplevel:
            real_out.write("\n" + format_code_info(code, version) + "\n")

        listing = Bytecode(code, opc, dup_lines=dup_lines).dis(show_bytes=show_bytes)
        real_out.write(listing + "\n")

        # Schedule nested code objects for later disassembly.
        queue.extend(const for const in code.co_consts if iscode(const))
def do_disassembly(self, func, expected):
    """Fail the test unless the disassembly of ``func`` matches ``expected``.

    :param func: function whose ``__code__`` is disassembled
    :param expected: expected listing as one newline-separated string
    """
    import difflib

    listing = Bytecode(func.__code__, opc).dis()

    # Trailing blanks are not significant; drop them before comparing.
    actual_lines = [text.rstrip() for text in listing.split('\n')]
    expected_lines = expected.split("\n")
    if expected_lines == actual_lines:
        return

    diff = "\n".join(difflib.ndiff(expected_lines, actual_lines))
    self.fail("events did not match expectation:\n" + diff)
def are_code_objects_equal(co1, co2):
    """
    Determine if two code objects are approximately equal,
    see are_instructions_equal for more information.

    :param co1: left code object to compare
    :param co2: right code object to compare

    :return: True if the two code objects are approximately equal,
             otherwise False.
    """
    instructions1 = list(Bytecode(co1))
    instructions2 = list(Bytecode(co2))

    # zip() stops at the shorter sequence, so without this check a code
    # object would compare equal to any other one that merely starts with
    # the same instructions.
    if len(instructions1) != len(instructions2):
        return False

    return all(
        are_instructions_equal(inst1, inst2)
        for inst1, inst2 in zip(instructions1, instructions2)
    )
def disco_loop_asm_format(opc, version, co, real_out):
    """Write an assembler-friendly disassembly of ``co`` to ``real_out``.

    Nested code objects found in ``co.co_consts`` are emitted first, so
    inner code appears before the outer code that uses it. The traversal
    is recursive and therefore uses stack space proportional to the
    nesting depth.

    :param opc: opcode module handed to ``Bytecode``
    :param version: Python version handed to ``format_code_info``
    :param co: code object to disassemble
    :param real_out: object with a ``write`` method receiving the listing
    """
    # Depth first: inner code objects before the one that contains them.
    for inner in co.co_consts:
        if not iscode(inner):
            continue
        disco_loop_asm_format(opc, version, inner, real_out)

    # Only a '<module>' code object without a filename goes without a header.
    wants_header = co.co_name != '<module>' or co.co_filename
    if wants_header:
        real_out.write("\n" + format_code_info(co, version) + "\n")

    listing = Bytecode(co, opc).dis(asm_format=True)
    real_out.write(listing + "\n")
def do_disassembly(self, func, expected):
    """Compare the disassembly of ``func`` with ``expected``; fail on mismatch.

    :param func: function whose ``__code__`` is disassembled
    :param expected: expected listing as one newline-separated string
    """
    import difflib

    bytecode = Bytecode(func.__code__, opc)

    # Trim trailing blanks (if any) so only visible text is compared.
    got_lines = [row.rstrip() for row in bytecode.dis().split('\n')]
    want_lines = expected.split("\n")

    if want_lines != got_lines:
        report = "\n".join(difflib.ndiff(want_lines, got_lines))
        self.fail("events did not match expectation:\n" + report)
def test_inst_size():
    """Check EXTENDED_ARG decoding of ``extended_arg_fn36`` on CPython 3.6.

    On any other interpreter the test has nothing to verify.
    """
    applies = (PYTHON_VERSION_TRIPLE[:2] == (3, 6)) and not IS_PYPY
    if not applies:
        assert True
        return

    opc = get_opcode_module(sys.version_info)
    bytecode_obj = Bytecode(extended_arg_fn36, opc)
    instructions = list(bytecode_obj.get_instructions(extended_arg_fn36))

    # The second instruction is the EXTENDED_ARG prefix itself ...
    prefix = instructions[1]
    assert prefix.opname == 'EXTENDED_ARG'
    assert prefix.argval == 0

    # ... and the jump after it must report the prefix and the combined size.
    jump = instructions[2]
    assert jump.opname == 'POP_JUMP_IF_FALSE'
    assert jump.has_extended_arg == True
    assert jump.inst_size == 4

    # for inst in instructions:
    #     print(inst)
def test_inst_size():
    """Check EXTENDED_ARG decoding of ``extended_arg_fn36`` on Python 3.6.

    On any other Python version the test has nothing to verify.
    """
    # sys.version_info has five fields (major, minor, micro, releaselevel,
    # serial), so comparing the whole tuple with (3, 6) is never true and
    # the checks below would never run. Compare major/minor only.
    if sys.version_info[:2] == (3, 6):
        variant = 'pypy' if IS_PYPY else None
        opc = get_opcode_module(sys.version_info, variant)
        bytecode_obj = Bytecode(extended_arg_fn36, opc)
        instructions = list(bytecode_obj.get_instructions(extended_arg_fn36))

        # The second instruction is the EXTENDED_ARG prefix itself ...
        inst1 = instructions[1]
        assert inst1.opname == 'EXTENDED_ARG'
        assert inst1.argval == 0

        # ... and the jump after it must report the prefix and the
        # combined size.
        inst2 = instructions[2]
        assert inst2.opname == 'POP_JUMP_IF_FALSE'
        assert inst2.has_extended_arg
        assert inst2.inst_size == 4

        # for inst in instructions:
        #     print(inst)
    else:
        assert True
def test_inst_size():
    """Check EXTENDED_ARG decoding of ``extended_arg_fn36`` on Python 3.6.

    On any other Python version the test has nothing to verify.
    """
    # sys.version_info has five fields (major, minor, micro, releaselevel,
    # serial), so comparing the whole tuple with (3, 6) is never true and
    # the checks below would never run. Compare major/minor only.
    if sys.version_info[:2] == (3, 6):
        variant = 'pypy' if IS_PYPY else None
        opc = get_opcode_module(sys.version_info, variant)
        bytecode_obj = Bytecode(extended_arg_fn36, opc)
        instructions = list(bytecode_obj.get_instructions(extended_arg_fn36))

        # The second instruction is the EXTENDED_ARG prefix itself ...
        inst1 = instructions[1]
        assert inst1.opname == 'EXTENDED_ARG'
        assert inst1.argval == 0

        # ... and the jump after it must report the prefix and the
        # combined size.
        inst2 = instructions[2]
        assert inst2.opname == 'POP_JUMP_IF_FALSE'
        assert inst2.has_extended_arg
        assert inst2.inst_size == 4

        # for inst in instructions:
        #     print(inst)
    else:
        assert True
def test_inst_jumps():
    """Check jump classification on the instructions of ``loop``.

    Every POP_JUMP_IF_FALSE must report itself as a jump, and every
    JUMP_ABSOLUTE must be a jump that does not go forward.
    """
    if sys.version_info >= (2, 7):
        variant = 'pypy' if IS_PYPY else None
        opc = get_opcode_module(sys.version_info, variant)
        bytecode_obj = Bytecode(extended_arg_fn36, opc)
        instructions = list(bytecode_obj.get_instructions(loop))

        seen_pjif = False
        seen_ja = False
        for inst in instructions:
            if inst.opname == "POP_JUMP_IF_FALSE":
                assert inst.is_jump()
                seen_pjif = True
            elif inst.opname == "JUMP_ABSOLUTE":
                assert inst.is_jump()
                assert not inst.jumps_forward()
                seen_ja = True

        assert seen_pjif
        # Python 3.10 code generation is more efficient and removes the
        # JUMP_ABSOLUTE, so only require one on earlier versions.
        if sys.version_info < (3, 10):
            assert seen_ja
def test_if_in_for():
    """Check jump targets and control structures found by the scanner.

    Which code object is scanned, and what is expected, depends on
    PYTHON_VERSION; versions outside the two handled ranges check nothing.
    """
    code = bug.__code__
    scan = get_scanner(PYTHON_VERSION)
    print(PYTHON_VERSION)

    if 2.7 <= PYTHON_VERSION <= 3.0 and not IS_PYPY:
        n = scan.setup_code(code)
        scan.build_lines_data(code, n)
        scan.build_prev_op(n)
        fjt = scan.find_jump_targets(False)

        ## FIXME: the data below is wrong.
        ## we get different results currently as well.
        ## We need to probably fix both the code
        ## and the test below
        # assert {15: [3], 69: [66], 63: [18]} == fjt
        # assert scan.structs == \
        #   [{'start': 0, 'end': 72, 'type': 'root'},
        #    {'start': 15, 'end': 66, 'type': 'if-then'},
        #    {'start': 31, 'end': 59, 'type': 'for-loop'},
        #    {'start': 62, 'end': 63, 'type': 'for-else'}]

        # Re-run the same steps on the looping variant of the bug.
        code = bug_loop.__code__
        n = scan.setup_code(code)
        scan.build_lines_data(code, n)
        scan.build_prev_op(n)
        fjt = scan.find_jump_targets(False)

        expected_targets = {64: [42], 67: [42, 42], 42: [16, 41], 19: [6]}
        expected_structs = [
            {'start': 0, 'end': 80, 'type': 'root'},
            {'start': 3, 'end': 64, 'type': 'if-then'},
            {'start': 6, 'end': 15, 'type': 'try'},
            {'start': 19, 'end': 38, 'type': 'except'},
            {'start': 45, 'end': 67, 'type': 'while-loop'},
            {'start': 70, 'end': 64, 'type': 'while-else'},
            # previous bug was not mistaking while-loop for if-then
            {'start': 48, 'end': 67, 'type': 'while-loop'},
        ]
        assert expected_targets == fjt
        assert scan.structs == expected_structs

    elif 3.2 < PYTHON_VERSION <= 3.4:
        bytecode = Bytecode(code, scan.opc)
        scan.code = array('B', code.co_code)
        scan.build_lines_data(code)
        scan.build_prev_op()
        scan.insts = list(bytecode)
        fjt = scan.find_jump_targets(False)

        expected_targets = {69: [66], 63: [18]}
        expected_structs = [
            {'end': 72, 'type': 'root', 'start': 0},
            {'end': 66, 'type': 'if-then', 'start': 6},
            {'end': 63, 'type': 'if-then', 'start': 18},
            {'end': 59, 'type': 'for-loop', 'start': 31},
            {'end': 63, 'type': 'for-else', 'start': 62},
        ]
        assert expected_targets == fjt
        assert scan.structs == expected_structs

    else:
        assert True, "FIXME: should note fixed"
def test_inst_jumps():
    """Check jump classification on the instructions of ``loop``.

    Every POP_JUMP_IF_FALSE must report itself as a jump, and every
    JUMP_ABSOLUTE must be a jump that does not go forward.
    """
    if not (sys.version_info >= (2, 7)):
        return

    variant = 'pypy' if IS_PYPY else None
    opc = get_opcode_module(sys.version_info, variant)
    bytecode_obj = Bytecode(extended_arg_fn36, opc)
    instructions = list(bytecode_obj.get_instructions(loop))

    # Which of the two opcodes of interest have been met so far.
    found = {"POP_JUMP_IF_FALSE": False, "JUMP_ABSOLUTE": False}
    for inst in instructions:
        name = inst.opname
        if name not in found:
            continue
        assert inst.is_jump()
        if name == "JUMP_ABSOLUTE":
            assert not inst.jumps_forward()
        found[name] = True

    assert found["POP_JUMP_IF_FALSE"]

    # Python 3.10 code generation is more efficient and removes the
    # JUMP_ABSOLUTE, so only require one on earlier versions.
    if PYTHON_VERSION_TRIPLE < (3, 10):
        assert found["JUMP_ABSOLUTE"]
def number_loop(queue, mappings, opc):
    """Pair up line numbers of corresponding code objects.

    ``queue`` holds code objects two at a time: an original followed by its
    counterpart. For each pair, ``[line, mapped_line]`` entries are added to
    ``mappings`` in place, and pairs of nested code objects found while
    walking both instruction streams are appended to ``queue``.

    :param queue: deque-like collection of code objects, consumed in pairs
    :param mappings: list extended in place with ``[line, line]`` pairs
    :param opc: opcode module handed to ``Bytecode``
    """
    while queue:
        code1 = queue.popleft()
        code2 = queue.popleft()
        assert code1.co_name == code2.co_name

        linestarts_orig = findlinestarts(code1)
        linestarts_uncompiled = list(findlinestarts(code2))
        mappings.extend(
            [[line, offset2line(offset, linestarts_uncompiled)]
             for offset, line in linestarts_orig])

        bytecode1 = Bytecode(code1, opc)
        bytecode2 = Bytecode(code2, opc)
        # One shared iterator: the search in the second stream resumes
        # where the previous search stopped.
        instr2s = bytecode2.get_instructions(code2)
        seen = set([code1.co_name])

        for instr in bytecode1.get_instructions(code1):
            inner1 = instr.argval if iscode(instr.argval) else None
            if not inner1:
                continue

            # Advance the second stream to its next nested code object.
            inner2 = None
            while not inner2:
                instr2 = next(instr2s, None)
                if instr2 is None:
                    break
                if iscode(instr2.argval):
                    inner2 = instr2.argval

            if not inner2:
                continue

            assert inner1.co_name == inner2.co_name
            if inner1.co_name in seen:
                continue
            seen.add(inner1.co_name)
            queue.append(inner1)
            queue.append(inner2)
def build_instructions(self, co):
    """
    Decode ``co`` into instruction objects (structured objects rather than
    an array of bytes) and store them in self.insts.

    Also sets self.code, self.lines and self.offset2inst_index (a map from
    instruction offset to its index in self.insts).

    :param co: code object to decode
    :return: the ``Bytecode`` object built for ``co``
    """
    # FIXME: remove this when all subsidiary functions have been removed.
    # We should be able to get everything from the self.insts list.
    self.code = array("B", co.co_code)

    bytecode = Bytecode(co, self.opc)
    self.build_prev_op()

    decoded = list(bytecode)
    self.insts = self.remove_extended_args(decoded)
    self.lines = self.build_lines_data(co)

    self.offset2inst_index = dict(
        (inst.offset, index) for index, inst in enumerate(self.insts))

    return bytecode
def disassemble(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 'Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

    Also, when we encounter certain tokens, we add them to a set which will cause custom
    grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
    cause specific rules for the specific number of arguments they take.

    :param co: code object to tokenize
    :param classname: not used in this method
    :param code_objects: not used in this method
    :param show_asm: 'before', 'after' or 'both' to print the instructions
                     and/or the resulting tokens; falls back to self.show_asm
                     when false
    :return: tuple ``(tokens, customize)`` - the list of tokens and a dict
             of names that need custom grammar handling
    """

    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'both'
    if show_asm in ('both', 'before'):
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # Container for tokens
    tokens = []

    customize = {}
    if self.is_pypy:
        customize['PyPy'] = 1

    self.code = array('B', co.co_code)
    self.build_lines_data(co)
    self.build_prev_op()

    bytecode = Bytecode(co, self.opc)

    # FIXME: put as its own method?
    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    bs = list(bytecode)
    n = len(bs)
    for i, inst in enumerate(bs):

        # We need to detect the difference between
        # "raise AssertionError" and "assert"
        # If we have a JUMP_FORWARD after the
        # RAISE_VARARGS then we have a "raise" statement
        # else we have an "assert" statement.
        if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
            next_inst = bs[i+1]
            if (next_inst.opname == 'LOAD_GLOBAL' and
                    next_inst.argval == 'AssertionError'):
                # Only the first RAISE_VARARGS after the load decides.
                for j in range(i+2, n):
                    raise_inst = bs[j]
                    if raise_inst.opname.startswith('RAISE_VARARGS'):
                        if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD':
                            self.load_asserts.add(next_inst.offset)
                        break

    # Get jump targets
    # Format: {target offset: [jump offsets]}
    jump_targets = self.find_jump_targets()

    for inst in bytecode:

        argval = inst.argval
        if inst.offset in jump_targets:
            # One COME_FROM pseudo-token per jump that lands here.
            for jump_idx, jump_offset in enumerate(jump_targets[inst.offset]):
                tokens.append(Token('COME_FROM', None, repr(jump_offset),
                                    offset='%s_%s' % (inst.offset, jump_idx),
                                    has_arg = True, opc=self.opc))

        pattr = inst.argrepr
        opname = inst.opname
        op = inst.opcode

        if opname == 'LOAD_CONST':
            const = inst.argval
            if iscode(const):
                if const.co_name == '<lambda>':
                    opname = 'LOAD_LAMBDA'
                elif const.co_name == '<genexpr>':
                    opname = 'LOAD_GENEXPR'
                elif const.co_name == '<dictcomp>':
                    opname = 'LOAD_DICTCOMP'
                elif const.co_name == '<setcomp>':
                    opname = 'LOAD_SETCOMP'
                elif const.co_name == '<listcomp>':
                    opname = 'LOAD_LISTCOMP'
                # verify() uses 'pattr' for comparison, since 'attr'
                # now holds Code(const) and thus can not be used
                # for comparison (todo: think about changing this)
                # pattr = 'code_object @ 0x%x %s->%s' %\
                # (id(const), const.co_filename, const.co_name)
                pattr = '<code_object ' + const.co_name + '>'
            else:
                pattr = const
        elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
            pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval)
            if name_pair_args > 0:
                opname = '%s_N%d' % (opname, name_pair_args)
            if annotate_args > 0:
                # The right-hand side must be a tuple: formatting with a
                # list raises TypeError ("not enough arguments").
                opname = '%s_A_%d' % (opname, annotate_args)
            opname = '%s_%d' % (opname, pos_args)
            pattr = ("%d positional, %d keyword pair, %d annotated" %
                     (pos_args, name_pair_args, annotate_args))
            tokens.append(
                Token(
                    type_ = opname,
                    attr = (pos_args, name_pair_args, annotate_args),
                    pattr = pattr,
                    offset = inst.offset,
                    linestart = inst.starts_line,
                    op = op,
                    has_arg = op_has_argument(op, op3),
                    opc = self.opc
                )
            )
            continue
        elif op in self.varargs_ops:
            pos_args = inst.argval
            if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
                opname = 'BUILD_MAP_n'
            else:
                opname = '%s_%d' % (opname, pos_args)
        elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'):
            # The value in the dict is in special cases in semantic actions, such
            # as CALL_FUNCTION. The value is not used in these cases, so we put
            # in arbitrary value 0.
            customize[opname] = 0
        elif opname == 'UNPACK_EX':
            # FIXME: try with scanner and parser by
            # changing inst.argval
            before_args = inst.argval & 0xFF
            after_args = (inst.argval >> 8) & 0xff
            pattr = "%d before vararg, %d after" % (before_args, after_args)
            argval = (before_args, after_args)
            opname = '%s_%d+%d' % (opname, before_args, after_args)
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries The continue-type jumps help us get
            # "continue" statements with would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            pattr = inst.argval
            target = self.get_target(inst.offset)
            if target <= inst.offset:
                next_opname = self.opname[self.code[inst.offset+3]]
                if (inst.offset in self.stmts and
                        next_opname not in ('END_FINALLY', 'POP_BLOCK')
                        and inst.offset not in self.not_continue):
                    opname = 'CONTINUE'
                else:
                    opname = 'JUMP_BACK'
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    # There are other situations were we don't catch
                    # CONTINUE as well.
                    if tokens[-1].type == 'JUMP_BACK':
                        tokens[-1].type = intern('CONTINUE')

        elif op == self.opc.RETURN_VALUE:
            if inst.offset in self.return_end_ifs:
                opname = 'RETURN_END_IF'
        elif inst.offset in self.load_asserts:
            opname = 'LOAD_ASSERT'

        tokens.append(
            Token(
                type_ = opname,
                attr = argval,
                pattr = pattr,
                offset = inst.offset,
                linestart = inst.starts_line,
                op = op,
                has_arg = (op >= op3.HAVE_ARGUMENT),
                opc = self.opc
            )
        )

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t)
        print()
    return tokens, customize
def disco_loop_asm_format(opc, version, co, real_out, fn_name_map, all_fns):
    """Produces disassembly in a format more conducive to
    automatic assembly by producing inner modules before they are
    used by outer ones. Since this is recursive, we'll
    use more stack space at runtime.

    :param opc: opcode module handed to ``Bytecode``
    :param version: Python version; selects the 2.x or 3.x compat wrapper
    :param co: code object to disassemble
    :param real_out: object with a ``write`` method receiving the listing
    :param fn_name_map: dict, updated in place, from uniquified name to the
                        name it was derived from
    :param all_fns: set, updated in place, of code names already emitted
    """

    if version < 3.0:
        co = code2compat(co)
    else:
        co = code3compat(co)

    co_name = co.co_name
    mapped_name = fn_name_map.get(co_name, co_name)

    new_consts = []
    for c in co.co_consts:
        if iscode(c):
            if version < 3.0:
                c_compat = code2compat(c)
            else:
                c_compat = code3compat(c)
            # Emit the nested code object before the one that uses it.
            disco_loop_asm_format(opc, version, c_compat, real_out,
                                  fn_name_map, all_fns)

            m = re.match(".* object <(.+)> at", str(c))
            if m:
                basename = m.group(1)
                if basename != 'module':
                    # Use a separate name for the child: assigning to
                    # mapped_name here would leak the child's unique name
                    # into this code object's own header and name below.
                    child_name = code_uniquify(basename, c.co_code)
                    c_compat.co_name = child_name
            c_compat.freeze()
            new_consts.append(c_compat)
        else:
            new_consts.append(c)
    co.co_consts = new_consts

    m = re.match("^<(.+)>$", co.co_name)
    if m or co_name in all_fns:
        if co_name in all_fns:
            basename = co_name
        else:
            basename = m.group(1)
        if basename != 'module':
            mapped_name = code_uniquify(basename, co.co_code)
            co_name = mapped_name
            assert mapped_name not in fn_name_map
        fn_name_map[mapped_name] = basename
        co.co_name = mapped_name
    elif co_name in fn_name_map:
        # FIXME: better would be a hash of the co_code
        mapped_name = code_uniquify(co_name, co.co_code)
        fn_name_map[mapped_name] = co_name
        co.co_name = mapped_name

    co = co.freeze()
    all_fns.add(co_name)
    if co.co_name != '<module>' or co.co_filename:
        real_out.write("\n" + format_code_info(co, version, mapped_name) + "\n")

    bytecode = Bytecode(co, opc, dup_lines=True)
    real_out.write(bytecode.dis(asm_format=True) + "\n")
def _dis_to_text(co):
    """Return the disassembly listing that ``Bytecode`` produces for ``co``."""
    bytecode = Bytecode(co)
    return bytecode.dis()
def dis(msg, msg_nocr, section, errmsg, x=None, start_line=-1, end_line=None,
        relative_pos = False, highlight='light', start_offset=0, end_offset=None,
        include_header=False):
    """Disassemble classes, methods, functions, or code.

    With no argument, disassemble the last traceback.

    :param msg: callable used to print a line of output
    :param msg_nocr: output callable passed on to the disassembly helpers
    :param section: callable used to print a section heading
    :param errmsg: callable used to report errors
    :param x: object to disassemble: instance, class or module, method,
              function, generator, frame, code object or source string
    :param start_line: first line to show; mentioned in the heading when > 1
    :param end_line: last line to show, if any
    :param relative_pos: passed on to the disassembly helpers
    :param highlight: passed on to ``disassemble``
    :param start_offset: first offset to show; ``None`` is treated as 0
    :param end_offset: last offset to show, if any
    :param include_header: when disassembling a frame, also print the
                           code-info header as comments
    :return: whatever ``disassemble``/``disassemble_string`` return for code
             objects and source strings; ``(None, None)`` otherwise
    """
    lasti = -1
    if x is None:
        distb()
        return None, None
    if start_offset is None:
        start_offset = 0

    # Describe the requested range for the section heading.
    mess = ''
    if start_line > 1:
        mess += "from line %d " % start_line
    elif start_offset > 1:
        mess = "from offset %d " % start_offset
    if end_line:
        mess += "to line %d" % end_line
    elif end_offset:
        mess += "to offset %d" % end_offset

    sectioned = False

    # Try to dogpaddle to the code object for the type setting x
    if hasattr(types, 'InstanceType') and isinstance(x, types.InstanceType):
        x = x.__class__
    if inspect.ismethod(x):
        section("Disassembly of %s: %s" % (x, mess))
        sectioned = True
        x = x.im_func
    elif inspect.isfunction(x) or inspect.isgeneratorfunction(x):
        section("Disassembly of %s: %s" % (x, mess))
        x = x.func_code
        sectioned = True
    elif inspect.isgenerator(x):
        section("Disassembly of %s: %s" % (x, mess))
        frame = x.gi_frame
        # The frame attribute is spelled f_lasti (as used for frames
        # below); "f_last_i" does not exist and raised AttributeError.
        lasti = frame.f_lasti
        x = x.gi_code
        sectioned = True
    elif inspect.isframe(x):
        section("Disassembly of %s: %s" % (x, mess))
        sectioned = True
        if hasattr(x, 'f_lasti'):
            lasti = x.f_lasti
            if lasti == -1:
                lasti = 0
        opc = get_opcode(PYTHON_VERSION, IS_PYPY)
        x = x.f_code
        if include_header:
            header_lines = Bytecode(x, opc).info().split("\n")
            header = '\n'.join([format_token(Mformat.Comment, h) for h in header_lines])
            msg(header)
    elif inspect.iscode(x):
        pass

    if hasattr(x, '__dict__'):  # Class or module
        items = sorted(x.__dict__.items())
        for name, x1 in items:
            if isinstance(x1, _have_code):
                if not sectioned:
                    section("Disassembly of %s: " % x)
                try:
                    dis(msg, msg_nocr, section, errmsg, x1,
                        start_line=start_line, end_line=end_line,
                        relative_pos = relative_pos)
                    msg("")
                except TypeError:
                    # Keep the exception in its own name: rebinding "msg"
                    # here would replace the output callback and break
                    # every later msg(...) call in this loop.
                    err = sys.exc_info()[1]
                    errmsg("Sorry:", err)
    elif hasattr(x, 'co_code'):  # Code object
        if not sectioned:
            section("Disassembly of %s: " % x)
        return disassemble(msg, msg_nocr, section, x, lasti=lasti,
                           start_line=start_line, end_line=end_line,
                           relative_pos = relative_pos,
                           highlight = highlight,
                           start_offset = start_offset,
                           end_offset = end_offset)
    elif isinstance(x, str):    # Source code
        return disassemble_string(msg, msg_nocr, x,)
    else:
        errmsg("Don't know how to disassemble %s objects." %
               type(x).__name__)
    return None, None
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 'Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

    Also, when we encounter certain tokens, we add them to a set which will cause custom
    grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
    cause specific rules for the specific number of arguments they take.

    :param co: code object to tokenize
    :param classname: passed to self.unmangle_code_names
    :param code_objects: not used in this method
    :param show_asm: 'before', 'after' or 'both' to print the instructions
                     and/or the resulting tokens; falls back to self.show_asm
                     when false
    :return: tuple ``(tokens, customize)`` - the list of tokens and a dict
             mapping specialized opcode names to their argument
    """

    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'after'
    if show_asm in ('both', 'before'):
        from xdis.bytecode import Bytecode
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # Container for tokens
    tokens = []

    customize = {}
    if self.is_pypy:
        customize['PyPy'] = 1

    Token = self.Token  # shortcut

    codelen = self.setup_code(co)

    self.build_lines_data(co, codelen)
    self.build_prev_op(codelen)

    free, names, varnames = self.unmangle_code_names(co, classname)
    self.names = names

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    for i in self.op_range(0, codelen):
        # We need to detect the difference between:
        #   raise AssertionError
        #  and
        #   assert ...
        # The pattern looked for is JUMP_IF_TRUE, POP_TOP and then a
        # LOAD_GLOBAL of 'AssertionError' at offsets i, i + 3 and i + 4.
        if (self.code[i] == self.opc.JUMP_IF_TRUE and
            i + 4 < codelen and
            self.code[i + 3] == self.opc.POP_TOP and
            self.code[i + 4] == self.opc.LOAD_GLOBAL):
            if names[self.get_argument(i + 4)] == 'AssertionError':
                self.load_asserts.add(i + 4)

    jump_targets = self.find_jump_targets(show_asm)
    # contains (code, [addrRefToCode])

    # Walk consecutive statement starts and record, in ``replace``, the
    # offsets whose token name must be swapped for a *_CONT variant.
    last_stmt = self.next_stmt[0]
    i = self.next_stmt[last_stmt]
    replace = {}
    while i < codelen - 1:
        if self.lines[last_stmt].next > i:
            # Distinguish "print ..." from "print ...,"
            if self.code[last_stmt] == self.opc.PRINT_ITEM:
                if self.code[i] == self.opc.PRINT_ITEM:
                    replace[i] = 'PRINT_ITEM_CONT'
                elif self.code[i] == self.opc.PRINT_NEWLINE:
                    replace[i] = 'PRINT_NEWLINE_CONT'
        last_stmt = i
        i = self.next_stmt[i]

    # Value carried over from a preceding EXTENDED_ARG instruction; it is
    # added to the argument of the instruction that follows.
    extended_arg = 0
    for offset in self.op_range(0, codelen):
        op = self.code[offset]
        op_name = self.opname[op]

        oparg = None
        pattr = None

        if offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in *descending*
            # offset order so we have the larger range or biggest instruction interval
            # last. (I think they are sorted in increasing order, but for safety
            # we sort them). That way, specific COME_FROM tags will match up
            # properly. For example, a "loop" with an "if" nested in it should have the
            # "loop" tag last so the grammar rule matches that properly.
            # Consecutive duplicates of a jump offset yield a single token.
            last_jump_offset = -1
            for jump_offset in sorted(jump_targets[offset], reverse=True):
                if jump_offset != last_jump_offset:
                    tokens.append(
                        Token('COME_FROM', None, repr(jump_offset),
                              offset="%s_%d" % (offset, jump_idx),
                              has_arg=True))
                    jump_idx += 1
                    last_jump_offset = jump_offset
        elif offset in self.thens:
            tokens.append(
                Token('THEN', None, self.thens[offset],
                      offset="%s_0" % offset,
                      has_arg=True))

        has_arg = (op >= self.opc.HAVE_ARGUMENT)
        if has_arg:
            oparg = self.get_argument(offset) + extended_arg
            extended_arg = 0
            if op == self.opc.EXTENDED_ARG:
                # Remember the scaled value for the next instruction and
                # emit no token for the EXTENDED_ARG itself.
                extended_arg = oparg * L65536
                continue
            if op in self.opc.CONST_OPS:
                const = co.co_consts[oparg]
                # We can't use inspect.iscode() because we may be
                # using a different version of Python than the
                # one that this was byte-compiled on. So the code
                # types may mismatch.
                if hasattr(const, 'co_name'):
                    oparg = const
                    if const.co_name == '<lambda>':
                        assert op_name == 'LOAD_CONST'
                        op_name = 'LOAD_LAMBDA'
                    elif const.co_name == self.genexpr_name:
                        op_name = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        op_name = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        op_name = 'LOAD_SETCOMP'
                    # verify uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' % \
                    # (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
            elif op in self.opc.NAME_OPS:
                pattr = names[oparg]
            elif op in self.opc.JREL_OPS:
                # Relative jump: shown as offset + 3 + argument.
                # NOTE(review): 3 is presumably the size of an instruction
                # that takes an argument - confirm.
                pattr = repr(offset + 3 + oparg)
                if op == self.opc.JUMP_FORWARD:
                    target = self.get_target(offset)
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    if len(tokens) and tokens[-1].type == 'JUMP_BACK':
                        tokens[-1].type = intern('CONTINUE')

            elif op in self.opc.JABS_OPS:
                pattr = repr(oparg)
            elif op in self.opc.LOCAL_OPS:
                pattr = varnames[oparg]
            elif op in self.opc.COMPARE_OPS:
                pattr = self.opc.cmp_op[oparg]
            elif op in self.opc.FREE_OPS:
                pattr = free[oparg]

        if op in self.varargs_ops:
            # CE - Hack for >= 2.5
            #      Now all values loaded via LOAD_CLOSURE are packed into
            #      a tuple before calling MAKE_CLOSURE.
            if (self.version >= 2.5 and op == self.opc.BUILD_TUPLE and
                self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE):
                continue
            else:
                # Specialize the name with the argument count and, except
                # for BUILD_SLICE, record it for custom grammar rules.
                op_name = '%s_%d' % (op_name, oparg)
                if op != self.opc.BUILD_SLICE:
                    customize[op_name] = oparg
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries The continue-type jumps help us get
            # "continue" statements with would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            target = self.get_target(offset)
            if target <= offset:
                op_name = 'JUMP_BACK'
                if (offset in self.stmts
                    and self.code[offset + 3] not in (self.opc.END_FINALLY,
                                                      self.opc.POP_BLOCK)):
                    if ((offset in self.linestartoffsets and
                         tokens[-1].type == 'JUMP_BACK')
                        or offset not in self.not_continue):
                        op_name = 'CONTINUE'
                else:
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    if tokens[-1].type == 'JUMP_BACK':
                        # We need 'intern' since we have
                        # already have processed the previous
                        # token.
                        tokens[-1].type = intern('CONTINUE')

        elif op == self.opc.LOAD_GLOBAL:
            if offset in self.load_asserts:
                op_name = 'LOAD_ASSERT'
        elif op == self.opc.RETURN_VALUE:
            if offset in self.return_end_ifs:
                op_name = 'RETURN_END_IF'

        if offset in self.linestartoffsets:
            linestart = self.linestartoffsets[offset]
        else:
            linestart = None

        # Emit the token, under its replacement name if one was recorded.
        if offset not in replace:
            tokens.append(
                Token(op_name, oparg, pattr, offset, linestart, op,
                      has_arg, self.opc))
        else:
            tokens.append(
                Token(replace[offset], oparg, pattr, offset, linestart,
                      op, has_arg, self.opc))
            pass
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t.format(line_prefix='L.'))
        print()
    return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 'Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

    Also, when we encounter certain tokens, we add them to a set which will cause custom
    grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
    cause specific rules for the specific number of arguments they take.

    :param co: code object to tokenize
    :param classname: passed to self.unmangle_code_names
    :param code_objects: not used in this method
    :param show_asm: 'before', 'after' or 'both' to print the instructions
                     and/or the resulting tokens; falls back to self.show_asm
                     when false
    :return: tuple ``(tokens, customize)`` - the list of tokens and a dict
             mapping specialized opcode names to their argument
    :raises NotImplementedError: when an EXTENDED_ARG instruction is met
    """

    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'after'
    if show_asm in ('both', 'before'):
        from xdis.bytecode import Bytecode
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # Container for tokens
    tokens = []

    customize = {}
    if self.is_pypy:
        customize['PyPy'] = 1

    Token = self.Token  # shortcut

    codelen = self.setup_code(co)

    self.build_lines_data(co, codelen)
    self.build_prev_op(codelen)

    free, names, varnames = self.unmangle_code_names(co, classname)
    self.names = names

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    for i in self.op_range(0, codelen):
        # We need to detect the difference between:
        #   raise AssertionError
        #  and
        #   assert ...
        # The pattern looked for is JUMP_IF_TRUE, POP_TOP and then a
        # LOAD_GLOBAL of 'AssertionError' at offsets i, i+3 and i+4.
        if (self.code[i] == self.opc.JUMP_IF_TRUE and
            i + 4 < codelen and
            self.code[i+3] == self.opc.POP_TOP and
            self.code[i+4] == self.opc.LOAD_GLOBAL):
            if names[self.get_argument(i+4)] == 'AssertionError':
                self.load_asserts.add(i+4)

    jump_targets = self.find_jump_targets(show_asm)
    # contains (code, [addrRefToCode])

    # Walk consecutive statement starts and record, in ``replace``, the
    # offsets whose token name must be swapped for a *_CONT variant.
    last_stmt = self.next_stmt[0]
    i = self.next_stmt[last_stmt]
    replace = {}
    while i < codelen - 1:
        if self.lines[last_stmt].next > i:
            # Distinguish "print ..." from "print ...,"
            if self.code[last_stmt] == self.opc.PRINT_ITEM:
                if self.code[i] == self.opc.PRINT_ITEM:
                    replace[i] = 'PRINT_ITEM_CONT'
                elif self.code[i] == self.opc.PRINT_NEWLINE:
                    replace[i] = 'PRINT_NEWLINE_CONT'
        last_stmt = i
        i = self.next_stmt[i]

    # Value that would be carried over from a preceding EXTENDED_ARG
    # instruction; see the note at the EXTENDED_ARG check below.
    extended_arg = 0
    for offset in self.op_range(0, codelen):
        op = self.code[offset]
        op_name = self.opname[op]

        oparg = None; pattr = None

        if offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in *descending*
            # offset order so we have the larger range or biggest instruction interval
            # last. (I think they are sorted in increasing order, but for safety
            # we sort them). That way, specific COME_FROM tags will match up
            # properly. For example, a "loop" with an "if" nested in it should have the
            # "loop" tag last so the grammar rule matches that properly.
            # Consecutive duplicates of a jump offset yield a single token.
            last_jump_offset = -1
            for jump_offset in sorted(jump_targets[offset], reverse=True):
                if jump_offset != last_jump_offset:
                    tokens.append(Token(
                        'COME_FROM', None, repr(jump_offset),
                        offset="%s_%d" % (offset, jump_idx),
                        has_arg = True))
                    jump_idx += 1
                    last_jump_offset = jump_offset
        elif offset in self.thens:
            tokens.append(Token(
                'THEN', None, self.thens[offset],
                offset="%s_0" % offset,
                has_arg = True))

        has_arg = (op >= self.opc.HAVE_ARGUMENT)
        if has_arg:
            oparg = self.get_argument(offset) + extended_arg
            extended_arg = 0
            if op == self.opc.EXTENDED_ARG:
                # NOTE(review): the raise makes the two statements after it
                # unreachable, so EXTENDED_ARG is rejected outright here.
                # Presumably a deliberate placeholder - confirm before
                # removing either the raise or the dead code.
                raise NotImplementedError
                extended_arg = oparg * scan.L65536
                continue
            if op in self.opc.hasconst:
                const = co.co_consts[oparg]
                # We can't use inspect.iscode() because we may be
                # using a different version of Python than the
                # one that this was byte-compiled on. So the code
                # types may mismatch.
                if hasattr(const, 'co_name'):
                    oparg = const
                    if const.co_name == '<lambda>':
                        assert op_name == 'LOAD_CONST'
                        op_name = 'LOAD_LAMBDA'
                    elif const.co_name == self.genexpr_name:
                        op_name = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        op_name = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        op_name = 'LOAD_SETCOMP'
                    # verify uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' % \
                    # (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
            elif op in self.opc.hasname:
                pattr = names[oparg]
            elif op in self.opc.hasjrel:
                # Relative jump: shown as offset + 3 + argument.
                # NOTE(review): 3 is presumably the size of an instruction
                # that takes an argument - confirm.
                pattr = repr(offset + 3 + oparg)
                if op == self.opc.JUMP_FORWARD:
                    target = self.get_target(offset)
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    if len(tokens) and tokens[-1].type == 'JUMP_BACK':
                        tokens[-1].type = intern('CONTINUE')

            elif op in self.opc.hasjabs:
                pattr = repr(oparg)
            elif op in self.opc.haslocal:
                pattr = varnames[oparg]
            elif op in self.opc.hascompare:
                pattr = self.opc.cmp_op[oparg]
            elif op in self.opc.hasfree:
                pattr = free[oparg]

        if op in self.varargs_ops:
            # CE - Hack for >= 2.5
            #      Now all values loaded via LOAD_CLOSURE are packed into
            #      a tuple before calling MAKE_CLOSURE.
            if (self.version >= 2.5 and op == self.opc.BUILD_TUPLE and
                self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE):
                continue
            else:
                # Specialize the name with the argument count and, except
                # for BUILD_SLICE, record it for custom grammar rules.
                op_name = '%s_%d' % (op_name, oparg)
                if op != self.opc.BUILD_SLICE:
                    customize[op_name] = oparg
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries The continue-type jumps help us get
            # "continue" statements with would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            target = self.get_target(offset)
            if target <= offset:
                op_name = 'JUMP_BACK'
                if (offset in self.stmts
                    and self.code[offset+3] not in (self.opc.END_FINALLY,
                                                    self.opc.POP_BLOCK)):
                    if ((offset in self.linestartoffsets and
                         tokens[-1].type == 'JUMP_BACK')
                        or offset not in self.not_continue):
                        op_name = 'CONTINUE'
                else:
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    if tokens[-1].type == 'JUMP_BACK':
                        # We need 'intern' since we have
                        # already have processed the previous
                        # token.
                        tokens[-1].type = intern('CONTINUE')

        elif op == self.opc.LOAD_GLOBAL:
            if offset in self.load_asserts:
                op_name = 'LOAD_ASSERT'
        elif op == self.opc.RETURN_VALUE:
            if offset in self.return_end_ifs:
                op_name = 'RETURN_END_IF'

        if offset in self.linestartoffsets:
            linestart = self.linestartoffsets[offset]
        else:
            linestart = None

        # Emit the token, under its replacement name if one was recorded.
        if offset not in replace:
            tokens.append(Token(
                op_name, oparg, pattr, offset, linestart, op,
                has_arg, self.opc))
        else:
            tokens.append(Token(
                replace[offset], oparg, pattr, offset, linestart,
                op, has_arg, self.opc))
            pass
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t.format(line_prefix='L.'))
        print()
    return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 'Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

    Also, when we encounter certain tokens, we record them in the
    "customize" dictionary, which will cause custom grammar rules.
    Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
    cause specific rules for the specific number of arguments they take.

    Parameters:
      co           -- the code object to tokenize. Its co_consts are read
                      here; it is also handed to setup_code(),
                      build_lines_data() and unmangle_code_names().
      classname    -- passed through to unmangle_code_names().
      code_objects -- not referenced anywhere in this method.
      show_asm     -- 'before', 'after' or 'both'. A falsy value means
                      "use self.show_asm". 'before'/'both' print the xdis
                      instructions before tokenizing; 'after'/'both' print
                      the resulting tokens.

    Returns a (tokens, customize) tuple: the list of Token objects and a
    dict keyed by the specialized operation names seen (plus 'PyPy' when
    self.is_pypy is set).

    Side effects: sets self.names and self.load_asserts, and prints to
    stdout when show_asm asks for it.
    """

    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'after'   (uncomment to force a token dump while debugging)
    if show_asm in ('both', 'before'):
        from xdis.bytecode import Bytecode
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # Container for tokens
    tokens = []

    customize = {}
    if self.is_pypy:
        customize['PyPy'] = 1

    Token = self.Token # shortcut

    # n bounds every scan over the bytecode below.
    n = self.setup_code(co)

    self.build_lines_data(co, n)
    self.build_prev_op(n)

    free, names, varnames = self.unmangle_code_names(co, classname)
    self.names = names

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    for i in self.op_range(0, n):
        # We need to detect the difference between:
        #   raise AssertionError
        #  and
        #   assert ...
        # Below we use the heuristic that it is preceded by a POP_JUMP.
        # However we could also use followed by RAISE_VARARGS
        # or for PyPy there may be a JUMP_IF_NOT_DEBUG before.
        # FIXME: remove uses of PJIF, and PJIT
        if self.is_pypy:
            have_pop_jump = self.code[i] in (self.opc.PJIF,
                                             self.opc.PJIT)
        else:
            have_pop_jump = self.code[i] == self.opc.PJIT

        # i + 3 is treated as the offset of the instruction after the jump
        # (presumably opcode byte + 2 argument bytes -- confirm).
        if have_pop_jump and self.code[i + 3] == self.opc.LOAD_GLOBAL:
            if names[self.get_argument(i + 3)] == 'AssertionError':
                self.load_asserts.add(i + 3)

    # Used below as a mapping: target offset -> offsets of the jumps
    # that land on that target.
    jump_targets = self.find_jump_targets(show_asm)

    # Walk statement starts pairwise and remember which PRINT_* opcodes
    # should be emitted under a "..._CONT" token name instead.
    last_stmt = self.next_stmt[0]
    i = self.next_stmt[last_stmt]
    replace = {}
    while i < n - 1:
        # presumably true when both statements belong to the same source
        # line -- confirm against build_lines_data()
        if self.lines[last_stmt].next > i:
            # Distinguish "print ..." from "print ...,"
            if self.code[last_stmt] == self.opc.PRINT_ITEM:
                if self.code[i] == self.opc.PRINT_ITEM:
                    replace[i] = 'PRINT_ITEM_CONT'
                elif self.code[i] == self.opc.PRINT_NEWLINE:
                    replace[i] = 'PRINT_NEWLINE_CONT'
        last_stmt = i
        i = self.next_stmt[i]

    # Carries the high-order part of an argument from an EXTENDED_ARG
    # instruction over to the instruction that follows it.
    extended_arg = 0
    for offset in self.op_range(0, n):
        if offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in *descending*
            # offset order so we have the larger range or biggest instruction interval
            # last. (I think they are sorted in increasing order, but for safety
            # we sort them). That way, specific COME_FROM tags will match up
            # properly. For example, a "loop" with an "if" nested in it should have the
            # "loop" tag last so the grammar rule matches that properly.
            # last_offset = -1
            for jump_offset in sorted(jump_targets[offset], reverse=True):
                # if jump_offset == last_offset:
                #     continue
                # last_offset = jump_offset
                come_from_name = 'COME_FROM'
                op_name = self.opc.opname[self.code[jump_offset]]
                # Only for version 2.7: name the COME_FROM after the SETUP_xxx
                # instruction it originates from, except for LOOP and EXCEPT.
                if op_name.startswith('SETUP_') and self.version == 2.7:
                    come_from_type = op_name[len('SETUP_'):]
                    if come_from_type not in ('LOOP', 'EXCEPT'):
                        come_from_name = 'COME_FROM_%s' % come_from_type
                    pass
                # The pseudo token's offset is "<target offset>_<index>" so
                # several COME_FROMs at one target get distinct offsets.
                tokens.append(
                    Token(come_from_name, None, repr(jump_offset),
                          offset="%s_%d" % (offset, jump_idx),
                          has_arg=True))
                jump_idx += 1

        op = self.code[offset]
        op_name = self.opc.opname[op]

        oparg = None
        pattr = None
        has_arg = op_has_argument(op, self.opc)
        if has_arg:
            oparg = self.get_argument(offset) + extended_arg
            extended_arg = 0
            if op == self.opc.EXTENDED_ARG:
                # No token is emitted for EXTENDED_ARG itself; its value is
                # scaled and folded into the next instruction's argument.
                extended_arg = oparg * scan.L65536
                continue
            if op in self.opc.hasconst:
                const = co.co_consts[oparg]
                if iscode(const):
                    # The token's attr becomes the nested code object itself.
                    oparg = const
                    if const.co_name == '<lambda>':
                        assert op_name == 'LOAD_CONST'
                        op_name = 'LOAD_LAMBDA'
                    elif const.co_name == '<genexpr>':
                        op_name = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        op_name = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        op_name = 'LOAD_SETCOMP'
                    # verify() uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' %\
                    # (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
            elif op in self.opc.hasname:
                pattr = names[oparg]
            elif op in self.opc.hasjrel:
                #  use instead: hasattr(self, 'patch_continue'): ?
                if self.version == 2.7:
                    self.patch_continue(tokens, offset, op)
                # Relative jump: show the absolute target, counted from the
                # instruction that follows this one.
                pattr = repr(offset + 3 + oparg)
            elif op in self.opc.hasjabs:
                # use instead: hasattr(self, 'patch_continue'): ?
                if self.version == 2.7:
                    self.patch_continue(tokens, offset, op)
                pattr = repr(oparg)
            elif op in self.opc.haslocal:
                pattr = varnames[oparg]
            elif op in self.opc.hascompare:
                pattr = self.opc.cmp_op[oparg]
            elif op in self.opc.hasfree:
                pattr = free[oparg]

        if op in self.varargs_ops:
            # CE - Hack for >= 2.5
            #      Now all values loaded via LOAD_CLOSURE are packed into
            #      a tuple before calling MAKE_CLOSURE.
            if op == self.opc.BUILD_TUPLE and \
               self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE:
                # That BUILD_TUPLE gets no token at all.
                continue
            else:
                if self.is_pypy and not oparg and op_name == 'BUILD_MAP':
                    op_name = 'BUILD_MAP_n'
                else:
                    # Specialize the name with the argument count, e.g. NAME_3.
                    op_name = '%s_%d' % (op_name, oparg)
                if op != self.opc.BUILD_SLICE:
                    customize[op_name] = oparg
        elif self.is_pypy and op_name in ('LOOKUP_METHOD',
                                          'JUMP_IF_NOT_DEBUG',
                                          'SETUP_EXCEPT',
                                          'SETUP_FINALLY'):
            # The value in the dict is in special cases in semantic actions, such
            # as CALL_FUNCTION. The value is not used in these cases, so we put
            # in arbitrary value 0.
            customize[op_name] = 0
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries. The continue-type jumps help us get
            # "continue" statements which would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            target = self.get_target(offset)
            if target <= offset:
                if (offset in self.stmts
                    and self.code[offset + 3] not in (self.opc.END_FINALLY,
                                                      self.opc.POP_BLOCK)
                    and offset not in self.not_continue):
                    op_name = 'CONTINUE'
                else:
                    op_name = 'JUMP_BACK'

        elif op == self.opc.LOAD_GLOBAL:
            if offset in self.load_asserts:
                op_name = 'LOAD_ASSERT'
        elif op == self.opc.RETURN_VALUE:
            if offset in self.return_end_ifs:
                op_name = 'RETURN_END_IF'

        if offset in self.linestartoffsets:
            linestart = self.linestartoffsets[offset]
        else:
            linestart = None

        # A name recorded in "replace" (the PRINT_*_CONT names) overrides
        # the name computed above.
        if offset not in replace:
            tokens.append(
                Token(op_name, oparg, pattr, offset, linestart, op,
                      has_arg, self.opc))
        else:
            tokens.append(
                Token(replace[offset], oparg, pattr, offset, linestart,
                      op, has_arg, self.opc))
            pass
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t.format(line_prefix='L.'))
        print()
    return tokens, customize
def disco_loop_asm_format(opc, version, co, real_out, fn_name_map, all_fns):
    """Write an assembler-oriented disassembly of `co` to `real_out`.

    Code objects found in `co.co_consts` are handled first, by calling
    this function recursively, so inner code objects are written out
    before the code object that uses them. Because of the recursion,
    more stack space is used at run time than with a queue-based loop.

    Parameters:
      opc         -- opcode information handed to Bytecode().
      version     -- passed through to format_code_info().
      co          -- the code object to disassemble.
      real_out    -- output object; only its write() method is used.
      fn_name_map -- dict updated in place: uniquified name -> base name.
      all_fns     -- set updated in place with the code names seen.

    NOTE(review): `mapped_name` is also reassigned while looping over the
    constants, so after the loop it may hold the name of the last nested
    code object -- confirm that this is intended.
    """
    co = codeType2Portable(co)
    co_name = co.co_name
    mapped_name = fn_name_map.get(co_name, co_name)

    rewritten_consts = []
    for const in co.co_consts:
        if not iscode(const):
            rewritten_consts.append(const)
            continue

        portable = (
            codeType2Portable(const)
            if isinstance(const, types.CodeType)
            else const
        )
        # Emit the nested code object before the one that contains it.
        disco_loop_asm_format(
            opc, version, portable, real_out, fn_name_map, all_fns
        )

        # Rename anything whose repr shows a bracketed name other than
        # <module> to a uniquified name.
        repr_match = re.match(".* object <(.+)> at", str(const))
        if repr_match and repr_match.group(1) != "module":
            mapped_name = code_uniquify(repr_match.group(1), const.co_code)
            portable.co_name = mapped_name
        portable.freeze()
        rewritten_consts.append(portable)
    co.co_consts = rewritten_consts

    bracketed = re.match("^<(.+)>$", co.co_name)
    if bracketed or co_name in all_fns:
        basename = co_name if co_name in all_fns else bracketed.group(1)
        if basename != "module":
            mapped_name = code_uniquify(basename, co.co_code)
            co_name = mapped_name
            assert mapped_name not in fn_name_map
        fn_name_map[mapped_name] = basename
        co.co_name = mapped_name
    elif co_name in fn_name_map:
        # FIXME: better would be a hash of the co_code
        mapped_name = code_uniquify(co_name, co.co_code)
        fn_name_map[mapped_name] = co_name
        co.co_name = mapped_name

    co = co.freeze()
    all_fns.add(co_name)
    if co.co_name != "<module>" or co.co_filename:
        header = format_code_info(co, version, mapped_name)
        real_out.write("\n" + header + "\n")

    listing = Bytecode(co, opc, dup_lines=True).dis(asm_format="asm")
    real_out.write(listing + "\n")
def disassemble(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Disassemble a Python 2 code object, returning a list of 'Token'.
    Various transformations are made to assist the deparsing grammar.
    For example:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
    The main part of this procedure is modelled after
    dis.disassemble().

    Parameters:
      co           -- the code object to tokenize.
      classname    -- when truthy, names in co that start with the mangled
                      class prefix are unmangled before being used as
                      token attributes.
      code_objects -- not referenced anywhere in this method.
      show_asm     -- 'before', 'after' or 'both'. A falsy value means
                      "use self.show_asm".

    Returns a (tokens, customize) tuple: the list of Token objects and a
    dict mapping specialized variable-argument operation names to their
    argument value.

    Side effects: sets self.names and self.load_asserts, and prints to
    stdout when show_asm asks for it.
    """

    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'before'   (uncomment to force an instruction dump while debugging)
    if show_asm in ('both', 'before'):
        from xdis.bytecode import Bytecode
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # Container for tokens
    tokens = []

    customize = {}
    Token = self.Token # shortcut

    # n bounds every scan over the bytecode below.
    n = self.setup_code(co)
    self.build_lines_data(co, n)
    self.build_prev_op(n)

    # self.lines contains (block,addrLastInstr)
    if classname:
        # Build the prefix to look for: '_' + class name without its
        # leading underscores + '__'.
        classname = '_' + classname.lstrip('_') + '__'

        def unmangle(name):
            # Drop the prefix but keep its last two characters (the '__'),
            # unless the name also ends in '__'.
            if name.startswith(classname) and name[-2:] != '__':
                return name[len(classname) - 2:]
            return name

        free = [ unmangle(name) for name in (co.co_cellvars + co.co_freevars) ]
        names = [ unmangle(name) for name in co.co_names ]
        varnames = [ unmangle(name) for name in co.co_varnames ]
    else:
        free = co.co_cellvars + co.co_freevars
        names = co.co_names
        varnames = co.co_varnames

    self.names = names

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    for i in self.op_range(0, n):
        # We need to detect the difference between
        # "raise AssertionError" and
        # "assert"
        # i+3 is treated as the offset of the instruction after the jump
        # (presumably opcode byte + 2 argument bytes -- confirm).
        if self.code[i] == self.opc.PJIT and self.code[i+3] == self.opc.LOAD_GLOBAL:
            if names[self.get_argument(i+3)] == 'AssertionError':
                self.load_asserts.add(i+3)

    # Used below as a mapping: target offset -> offsets of the jumps
    # that land on that target.
    cf = self.find_jump_targets()

    # Walk statement starts pairwise and remember which PRINT_* opcodes
    # should be emitted under a "..._CONT" token name instead.
    last_stmt = self.next_stmt[0]
    i = self.next_stmt[last_stmt]
    replace = {}
    while i < n-1:
        # presumably true when both statements belong to the same source
        # line -- confirm against build_lines_data()
        if self.lines[last_stmt].next > i:
            if self.code[last_stmt] == self.opc.PRINT_ITEM:
                if self.code[i] == self.opc.PRINT_ITEM:
                    replace[i] = 'PRINT_ITEM_CONT'
                elif self.code[i] == self.opc.PRINT_NEWLINE:
                    replace[i] = 'PRINT_NEWLINE_CONT'
        last_stmt = i
        i = self.next_stmt[i]

    # Carries the high-order part of an argument from an EXTENDED_ARG
    # instruction over to the instruction that follows it.
    extended_arg = 0
    for offset in self.op_range(0, n):
        if offset in cf:
            # One COME_FROM pseudo token per jump landing here; its offset
            # is "<target offset>_<index>" so each one is distinct.
            k = 0
            for j in cf[offset]:
                tokens.append(Token(
                    'COME_FROM', None, repr(j),
                    offset="%s_%d" % (offset, k),
                    has_arg = True))
                k += 1

        op = self.code[offset]
        opname = self.opc.opname[op]

        oparg = None; pattr = None
        has_arg = (op >= self.opc.HAVE_ARGUMENT)
        if has_arg:
            oparg = self.get_argument(offset) + extended_arg
            extended_arg = 0
            if op == self.opc.EXTENDED_ARG:
                # No token is emitted for EXTENDED_ARG itself; its value is
                # scaled and folded into the next instruction's argument.
                extended_arg = oparg * scan.L65536
                continue
            if op in self.opc.hasconst:
                const = co.co_consts[oparg]
                if iscode(const):
                    # The token's attr becomes the nested code object itself.
                    oparg = const
                    if const.co_name == '<lambda>':
                        assert opname == 'LOAD_CONST'
                        opname = 'LOAD_LAMBDA'
                    elif const.co_name == '<genexpr>':
                        opname = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        opname = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        opname = 'LOAD_SETCOMP'
                    # verify() uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' %\
                    # (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
            elif op in self.opc.hasname:
                pattr = names[oparg]
            elif op in self.opc.hasjrel:
                # Relative jump: show the absolute target, counted from the
                # instruction that follows this one.
                pattr = repr(offset + 3 + oparg)
            elif op in self.opc.hasjabs:
                pattr = repr(oparg)
            elif op in self.opc.haslocal:
                pattr = varnames[oparg]
            elif op in self.opc.hascompare:
                pattr = self.opc.cmp_op[oparg]
            elif op in self.opc.hasfree:
                pattr = free[oparg]

        if op in self.varargs_ops:
            # CE - Hack for >= 2.5
            #      Now all values loaded via LOAD_CLOSURE are packed into
            #      a tuple before calling MAKE_CLOSURE.
            if op == self.opc.BUILD_TUPLE and \
               self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE:
                # That BUILD_TUPLE gets no token at all.
                continue
            else:
                # Specialize the name with the argument count, e.g. NAME_3.
                opname = '%s_%d' % (opname, oparg)
                if op != self.opc.BUILD_SLICE:
                    customize[opname] = oparg
        elif op == self.opc.JUMP_ABSOLUTE:
            # Backward absolute jumps are renamed either CONTINUE or
            # JUMP_BACK. Note the strict "<": a jump to its own offset is
            # left as JUMP_ABSOLUTE here.
            target = self.get_target(offset)
            if target < offset:
                if (offset in self.stmts
                    and self.code[offset+3] not in (self.opc.END_FINALLY,
                                                    self.opc.POP_BLOCK)
                    and offset not in self.not_continue):
                    opname = 'CONTINUE'
                else:
                    opname = 'JUMP_BACK'

        elif op == self.opc.LOAD_GLOBAL:
            if offset in self.load_asserts:
                opname = 'LOAD_ASSERT'
        elif op == self.opc.RETURN_VALUE:
            if offset in self.return_end_ifs:
                opname = 'RETURN_END_IF'

        if offset in self.linestartoffsets:
            linestart = self.linestartoffsets[offset]
        else:
            linestart = None

        # A name recorded in "replace" (the PRINT_*_CONT names) overrides
        # the name computed above.
        if offset not in replace:
            tokens.append(Token(
                opname, oparg, pattr, offset, linestart, op,
                has_arg))
        else:
            tokens.append(Token(
                replace[offset], oparg, pattr, offset, linestart,
                op, has_arg))
            pass
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t.format())
        print()
    return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 'Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

    Also, when we encounter certain tokens, we record them in the
    "customize" dictionary, which will cause custom grammar rules.
    Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
    cause specific rules for the specific number of arguments they take.

    Parameters:
      co           -- the code object to tokenize; its co_code is copied
                      into self.code.
      classname    -- not referenced anywhere in this method.
      code_objects -- not referenced anywhere in this method.
      show_asm     -- 'before', 'after' or 'both'. A falsy value means
                      "use self.show_asm".

    Returns a (tokens, customize) tuple. In this method "customize" only
    ever receives the 'PyPy' key and, under PyPy, the CALL_METHOD and
    JUMP_IF_NOT_DEBUG names.

    Side effects: sets self.code and self.load_asserts, and prints to
    stdout when show_asm asks for it.
    """

    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'after'   (uncomment to force a token dump while debugging)
    if show_asm in ('both', 'before'):
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # Container for tokens
    tokens = []

    customize = {}
    if self.is_pypy:
        customize['PyPy'] = 1

    self.code = array('B', co.co_code)
    self.build_lines_data(co)
    self.build_prev_op()

    bytecode = Bytecode(co, self.opc)

    # FIXME: put as its own method?
    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    bs = list(bytecode)
    n = len(bs)
    for i in range(n):
        inst = bs[i]

        # We need to detect the difference between
        # "raise AssertionError" and "assert"
        # If we have a JUMP_FORWARD after the
        # RAISE_VARARGS then we have a "raise" statement
        # else we have an "assert" statement.
        if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
            next_inst = bs[i+1]
            if (next_inst.opname == 'LOAD_GLOBAL' and
                next_inst.argval == 'AssertionError'):
                # Only the first RAISE_VARARGS after the LOAD_GLOBAL is
                # examined; the search stops there either way.
                for j in range(i+2, n):
                    raise_inst = bs[j]
                    if raise_inst.opname.startswith('RAISE_VARARGS'):
                        if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD':
                            self.load_asserts.add(next_inst.offset)
                            pass
                        break
                pass
            pass

    # Get jump targets
    # Format: {target offset: [jump offsets]}
    jump_targets = self.find_jump_targets(show_asm)

    for inst in bytecode:

        argval = inst.argval
        if inst.offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in *descending*
            # offset order so we have the larger range or biggest instruction interval
            # last. (I think they are sorted in increasing order, but for safety
            # we sort them). That way, specific COME_FROM tags will match up
            # properly. For example, a "loop" with an "if" nested in it should have the
            # "loop" tag last so the grammar rule matches that properly.
            for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
                come_from_name = 'COME_FROM'
                opname = self.opName(jump_offset)
                # Name the COME_FROM after the SETUP_xxx instruction it
                # originates from.
                if opname.startswith('SETUP_'):
                    come_from_type = opname[len('SETUP_'):]
                    come_from_name = 'COME_FROM_%s' % come_from_type
                    pass
                # The pseudo token's offset is "<target offset>_<index>" so
                # several COME_FROMs at one target get distinct offsets.
                tokens.append(Token(come_from_name,
                                    None, repr(jump_offset),
                                    offset='%s_%s' % (inst.offset, jump_idx),
                                    has_arg = True, opc=self.opc))
                jump_idx += 1
                pass
            pass
        elif inst.offset in self.else_start:
            # Emit an ELSE pseudo token whose pattr is the repr of the
            # value stored in self.else_start for this offset.
            end_offset = self.else_start[inst.offset]
            tokens.append(Token('ELSE',
                                None, repr(end_offset),
                                offset='%s' % (inst.offset),
                                has_arg = True, opc=self.opc))

            pass

        pattr = inst.argrepr
        opname = inst.opname
        op = inst.opcode

        if opname in ['LOAD_CONST']:
            const = inst.argval
            if iscode(const):
                if const.co_name == '<lambda>':
                    opname = 'LOAD_LAMBDA'
                elif const.co_name == '<genexpr>':
                    opname = 'LOAD_GENEXPR'
                elif const.co_name == '<dictcomp>':
                    opname = 'LOAD_DICTCOMP'
                elif const.co_name == '<setcomp>':
                    opname = 'LOAD_SETCOMP'
                elif const.co_name == '<listcomp>':
                    opname = 'LOAD_LISTCOMP'
                # verify() uses 'pattr' for comparison, since 'attr'
                # now holds Code(const) and thus can not be used
                # for comparison (todo: think about changing this)
                # pattr = 'code_object @ 0x%x %s->%s' %\
                # (id(const), const.co_filename, const.co_name)
                pattr = '<code_object ' + const.co_name + '>'
            else:
                pattr = const
                pass
        elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
            # Encode the three counts into the token name as
            # NAME[_N<pairs>][_A_<annotated>]_<positional>.
            pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval)
            if name_pair_args > 0:
                opname = '%s_N%d' % (opname, name_pair_args)
                pass
            if annotate_args > 0:
                opname = '%s_A_%d' % (opname, annotate_args)
                pass
            opname = '%s_%d' % (opname, pos_args)
            pattr = ("%d positional, %d keyword pair, %d annotated" %
                        (pos_args, name_pair_args, annotate_args))
            tokens.append(
                Token(
                    type_ = opname,
                    attr = (pos_args, name_pair_args, annotate_args),
                    pattr = pattr,
                    offset = inst.offset,
                    linestart = inst.starts_line,
                    op = op,
                    has_arg = op_has_argument(op, op3),
                    opc = self.opc
                )
            )
            # This token has already been appended, so skip the generic
            # append at the bottom of the loop.
            continue
        elif op in self.varargs_ops:
            pos_args = inst.argval
            if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
                opname = 'BUILD_MAP_n'
            else:
                # Specialize the name with the argument count, e.g. NAME_3.
                opname = '%s_%d' % (opname, pos_args)
        elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'):
            # The value in the dict is in special cases in semantic actions, such
            # as CALL_FUNCTION. The value is not used in these cases, so we put
            # in arbitrary value 0.
            customize[opname] = 0
        elif opname == 'UNPACK_EX':
            # FIXME: try with scanner and parser by
            # changing inst.argval
            # Low byte: count before the starred target; next byte: count after.
            before_args = inst.argval & 0xFF
            after_args = (inst.argval >> 8) & 0xff
            pattr = "%d before vararg, %d after" % (before_args, after_args)
            argval = (before_args, after_args)
            opname = '%s_%d+%d' % (opname, before_args, after_args)
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries. The continue-type jumps help us get
            # "continue" statements which would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            pattr = inst.argval
            target = self.get_target(inst.offset)
            if target <= inst.offset:
                # offset+3 is treated as the start of the next instruction.
                next_opname = self.opname[self.code[inst.offset+3]]
                if (inst.offset in self.stmts and
                    next_opname not in ('END_FINALLY', 'POP_BLOCK',
                                        # Python 3.0 only uses POP_TOP
                                        'POP_TOP')
                    and inst.offset not in self.not_continue):
                    opname = 'CONTINUE'
                else:
                    opname = 'JUMP_BACK'
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    # There are other situations where we don't catch
                    # CONTINUE as well.
                    # NOTE(review): tokens[-1] assumes at least one token
                    # has already been emitted -- confirm this always holds.
                    if tokens[-1].type == 'JUMP_BACK' and tokens[-1].attr <= argval:
                        # intern is used because we are changing the *previous* token
                        tokens[-1].type = intern('CONTINUE')

        elif op == self.opc.RETURN_VALUE:
            if inst.offset in self.return_end_ifs:
                opname = 'RETURN_END_IF'
        elif inst.offset in self.load_asserts:
            opname = 'LOAD_ASSERT'

        tokens.append(
            Token(
                type_ = opname,
                attr = argval,
                pattr = pattr,
                offset = inst.offset,
                linestart = inst.starts_line,
                op = op,
                has_arg = (op >= op3.HAVE_ARGUMENT),
                opc = self.opc
            )
        )
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t)
        print()
    return tokens, customize
def disassemble(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 'Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

    Also, when we encounter certain tokens, we record them in the
    "customize" dictionary, which will cause custom grammar rules.
    Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
    cause specific rules for the specific number of arguments they take.

    Parameters:
      co           -- the code object to tokenize.
      classname    -- when truthy, names in co that start with the mangled
                      class prefix are unmangled before being used as
                      token attributes.
      code_objects -- not referenced anywhere in this method.
      show_asm     -- 'before', 'after' or 'both'. A falsy value means
                      "use self.show_asm".

    Returns a (tokens, customize) tuple: the list of Token objects and a
    dict keyed by the specialized operation names seen (plus 'PyPy' when
    self.is_pypy is set).

    Side effects: sets self.names and self.load_asserts, and prints to
    stdout when show_asm asks for it.
    """

    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'before'   (uncomment to force an instruction dump while debugging)
    if show_asm in ('both', 'before'):
        from xdis.bytecode import Bytecode
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # Container for tokens
    tokens = []

    customize = {}
    if self.is_pypy:
        customize['PyPy'] = 1;

    Token = self.Token # shortcut

    # n bounds every scan over the bytecode below.
    n = self.setup_code(co)

    self.build_lines_data(co, n)
    self.build_prev_op(n)

    # self.lines contains (block,addrLastInstr)
    if classname:
        # Build the prefix to look for: '_' + class name without its
        # leading underscores + '__'.
        classname = '_' + classname.lstrip('_') + '__'

        def unmangle(name):
            # Drop the prefix but keep its last two characters (the '__'),
            # unless the name also ends in '__'.
            if name.startswith(classname) and name[-2:] != '__':
                return name[len(classname) - 2:]
            return name

        free = [ unmangle(name) for name in (co.co_cellvars + co.co_freevars) ]
        names = [ unmangle(name) for name in co.co_names ]
        varnames = [ unmangle(name) for name in co.co_varnames ]
    else:
        free = co.co_cellvars + co.co_freevars
        names = co.co_names
        varnames = co.co_varnames

    self.names = names

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    for i in self.op_range(0, n):
        # We need to detect the difference between:
        #   raise AssertionError
        #  and
        #   assert ...
        # Below we use the heuristic that it is preceded by a POP_JUMP.
        # However we could also use followed by RAISE_VARARGS
        # or for PyPy there may be a JUMP_IF_NOT_DEBUG before.
        # FIXME: remove uses of PJIF, and PJIT
        if self.is_pypy:
            have_pop_jump = self.code[i] in (self.opc.PJIF,
                                             self.opc.PJIT)
        else:
            have_pop_jump = self.code[i] == self.opc.PJIT

        # i+3 is treated as the offset of the instruction after the jump
        # (presumably opcode byte + 2 argument bytes -- confirm).
        if have_pop_jump and self.code[i+3] == self.opc.LOAD_GLOBAL:
            if names[self.get_argument(i+3)] == 'AssertionError':
                self.load_asserts.add(i+3)

    # Used below as a mapping: target offset -> offsets of the jumps
    # that land on that target.
    cf = self.find_jump_targets()

    # Walk statement starts pairwise and remember which PRINT_* opcodes
    # should be emitted under a "..._CONT" token name instead.
    last_stmt = self.next_stmt[0]
    i = self.next_stmt[last_stmt]
    replace = {}
    while i < n-1:
        # presumably true when both statements belong to the same source
        # line -- confirm against build_lines_data()
        if self.lines[last_stmt].next > i:
            if self.code[last_stmt] == self.opc.PRINT_ITEM:
                if self.code[i] == self.opc.PRINT_ITEM:
                    replace[i] = 'PRINT_ITEM_CONT'
                elif self.code[i] == self.opc.PRINT_NEWLINE:
                    replace[i] = 'PRINT_NEWLINE_CONT'
        last_stmt = i
        i = self.next_stmt[i]

    # Carries the high-order part of an argument from an EXTENDED_ARG
    # instruction over to the instruction that follows it.
    extended_arg = 0
    for offset in self.op_range(0, n):
        if offset in cf:
            # One COME_FROM pseudo token per jump landing here; its offset
            # is "<target offset>_<index>" so each one is distinct.
            k = 0
            for j in cf[offset]:
                tokens.append(Token(
                    'COME_FROM', None, repr(j),
                    offset="%s_%d" % (offset, k),
                    has_arg = True))
                k += 1

        op = self.code[offset]
        opname = self.opc.opname[op]

        oparg = None; pattr = None
        has_arg = (op >= self.opc.HAVE_ARGUMENT)
        if has_arg:
            oparg = self.get_argument(offset) + extended_arg
            extended_arg = 0
            if op == self.opc.EXTENDED_ARG:
                # No token is emitted for EXTENDED_ARG itself; its value is
                # scaled and folded into the next instruction's argument.
                extended_arg = oparg * scan.L65536
                continue
            if op in self.opc.hasconst:
                const = co.co_consts[oparg]
                if iscode(const):
                    # The token's attr becomes the nested code object itself.
                    oparg = const
                    if const.co_name == '<lambda>':
                        assert opname == 'LOAD_CONST'
                        opname = 'LOAD_LAMBDA'
                    elif const.co_name == '<genexpr>':
                        opname = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        opname = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        opname = 'LOAD_SETCOMP'
                    # verify() uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' %\
                    # (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
            elif op in self.opc.hasname:
                pattr = names[oparg]
            elif op in self.opc.hasjrel:
                #  use instead: hasattr(self, 'patch_continue'): ?
                if self.version == 2.7:
                    self.patch_continue(tokens, offset, op)
                # Relative jump: show the absolute target, counted from the
                # instruction that follows this one.
                pattr = repr(offset + 3 + oparg)
            elif op in self.opc.hasjabs:
                # use instead: hasattr(self, 'patch_continue'): ?
                if self.version == 2.7:
                    self.patch_continue(tokens, offset, op)
                pattr = repr(oparg)
            elif op in self.opc.haslocal:
                pattr = varnames[oparg]
            elif op in self.opc.hascompare:
                pattr = self.opc.cmp_op[oparg]
            elif op in self.opc.hasfree:
                pattr = free[oparg]

        if op in self.varargs_ops:
            # CE - Hack for >= 2.5
            #      Now all values loaded via LOAD_CLOSURE are packed into
            #      a tuple before calling MAKE_CLOSURE.
            if op == self.opc.BUILD_TUPLE and \
               self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE:
                # That BUILD_TUPLE gets no token at all.
                continue
            else:
                if self.is_pypy and not oparg and opname == 'BUILD_MAP':
                    opname = 'BUILD_MAP_n'
                else:
                    # Specialize the name with the argument count, e.g. NAME_3.
                    opname = '%s_%d' % (opname, oparg)
                if op != self.opc.BUILD_SLICE:
                    customize[opname] = oparg
        elif self.is_pypy and opname in ('LOOKUP_METHOD',
                                         'JUMP_IF_NOT_DEBUG',
                                         'SETUP_EXCEPT',
                                         'SETUP_FINALLY'):
            # The value in the dict is in special cases in semantic actions, such
            # as CALL_FUNCTION. The value is not used in these cases, so we put
            # in arbitrary value 0.
            customize[opname] = 0
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries. The continue-type jumps help us get
            # "continue" statements which would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            target = self.get_target(offset)
            if target <= offset:
                if (offset in self.stmts
                    and self.code[offset+3] not in (self.opc.END_FINALLY,
                                                    self.opc.POP_BLOCK)
                    and offset not in self.not_continue):
                    opname = 'CONTINUE'
                else:
                    opname = 'JUMP_BACK'

        elif op == self.opc.LOAD_GLOBAL:
            if offset in self.load_asserts:
                opname = 'LOAD_ASSERT'
        elif op == self.opc.RETURN_VALUE:
            if offset in self.return_end_ifs:
                opname = 'RETURN_END_IF'

        if offset in self.linestartoffsets:
            linestart = self.linestartoffsets[offset]
        else:
            linestart = None

        # A name recorded in "replace" (the PRINT_*_CONT names) overrides
        # the name computed above.
        if offset not in replace:
            tokens.append(Token(
                opname, oparg, pattr, offset, linestart, op,
                has_arg, self.opc))
        else:
            tokens.append(Token(
                replace[offset], oparg, pattr, offset, linestart,
                op, has_arg, self.opc))
            pass
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t)
        print()
    return tokens, customize
def disassemble(self, co, classname=None, code_objects={}, show_asm=None):
    '''
    Disassemble a code object, returning a list of 'Token'.

    The main part of this procedure is modelled after
    dis.disassemble().

    Parameters:
      co           -- the code object to tokenize.
      classname    -- when truthy, names in co that start with the mangled
                      class prefix are unmangled before being used as
                      token attributes.
      code_objects -- not referenced anywhere in this method.
      show_asm     -- 'before', 'after' or 'both'. A falsy value means
                      "use self.show_asm". 'before'/'both' print the xdis
                      instructions before tokenizing; any truthy value
                      other than 'before' prints the resulting tokens.

    Returns a (tokens, customize) tuple: the list of Token objects and a
    dict mapping specialized variable-argument operation names to their
    argument value.

    Raises NotImplementedError when an EXTENDED_ARG instruction is met.

    Side effects: sets self.linestarts, self.names and self.load_asserts,
    and prints to stdout when show_asm asks for it.
    '''

    show_asm = self.show_asm if not show_asm else show_asm
    if show_asm in ('both', 'before'):
        from xdis.bytecode import Bytecode
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # Container for tokens
    tokens = []

    customize = {}
    Token = self.Token  # shortcut

    n = self.setup_code(co)

    self.build_lines_data(co, n)
    self.build_prev_op(n)

    # linestarts contains block code addresses (addr,block)
    self.linestarts = list(findlinestarts(co))

    # class and names
    if classname:
        # Build the prefix to look for: '_' + class name without its
        # leading underscores + '__'.
        classname = '_' + classname.lstrip('_') + '__'

        def unmangle(name):
            # Drop the prefix but keep its last two characters (the '__'),
            # unless the name also ends in '__'.
            if name.startswith(classname) and name[-2:] != '__':
                return name[len(classname) - 2:]
            return name

        free = [unmangle(name) for name in (co.co_cellvars + co.co_freevars)]
        names = [unmangle(name) for name in co.co_names]
        varnames = [unmangle(name) for name in co.co_varnames]
    else:
        free = co.co_cellvars + co.co_freevars
        names = co.co_names
        varnames = co.co_varnames
    self.names = names

    codelen = len(self.code)

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    for i in self.op_range(0, n):
        # We need to detect the difference between
        # "raise AssertionError" and
        # "assert"
        # The pattern looked for is JUMP_IF_TRUE, POP_TOP, LOAD_GLOBAL;
        # the bounds check keeps the i+4 lookup inside the bytecode.
        if (self.code[i] == self.opc.JUMP_IF_TRUE and
            i + 4 < codelen and
            self.code[i+3] == self.opc.POP_TOP and
            self.code[i+4] == self.opc.LOAD_GLOBAL):
            if names[self.get_argument(i+4)] == 'AssertionError':
                self.load_asserts.add(i+4)

    # Used below as a mapping: target offset -> offsets of the jumps
    # that land on that target.
    cf = self.find_jump_targets()

    # Walk statement starts pairwise and remember which PRINT_* opcodes
    # should be emitted under a "..._CONT" token name instead.
    last_stmt = self.next_stmt[0]
    i = self.next_stmt[last_stmt]
    replace = {}
    while i < codelen - 1:
        if self.lines[last_stmt].next > i:
            if self.code[last_stmt] == self.opc.PRINT_ITEM:
                if self.code[i] == self.opc.PRINT_ITEM:
                    replace[i] = 'PRINT_ITEM_CONT'
                elif self.code[i] == self.opc.PRINT_NEWLINE:
                    replace[i] = 'PRINT_NEWLINE_CONT'
        last_stmt = i
        i = self.next_stmt[i]

    imports = self.all_instr(0, codelen,
                             (self.opc.IMPORT_NAME, self.opc.IMPORT_FROM,
                              self.opc.IMPORT_STAR))
    # Changes IMPORT_NAME to IMPORT_NAME_CONT.
    # Possibly a Python 2.0 hangover
    if len(imports) > 1 and self.version < 2.3:
        last_import = imports[0]
        for i in imports[1:]:
            if self.lines[last_import].next > i:
                if self.code[last_import] == self.opc.IMPORT_NAME == self.code[i]:
                    replace[i] = 'IMPORT_NAME_CONT'
            last_import = i

    for offset in self.op_range(0, codelen):
        op = self.code[offset]
        op_name = self.opname[op]
        oparg = None
        pattr = None

        if offset in cf:
            # One COME_FROM pseudo token per jump landing here; its offset
            # is "<target offset>_<index>" so each one is distinct.
            for k, j in enumerate(cf[offset]):
                tokens.append(Token('COME_FROM', None, repr(j),
                                    offset="%s_%d" % (offset, k),
                                    has_arg=True))

        has_arg = (op >= self.opc.HAVE_ARGUMENT)
        if has_arg:
            if op == self.opc.EXTENDED_ARG:
                # Extended arguments are not handled by this scanner.
                # Previously dead code after the raise accumulated a value
                # that could never be used.
                raise NotImplementedError(
                    "EXTENDED_ARG at offset %s is not supported" % offset)
            oparg = self.get_argument(offset)
            if op in self.opc.hasconst:
                const = co.co_consts[oparg]
                # We can't use inspect.iscode() because we may be
                # using a different version of Python than the
                # one that this was byte-compiled on. So the code
                # types may mismatch.
                if hasattr(const, 'co_name'):
                    # The token's attr becomes the nested code object itself.
                    oparg = const
                    if const.co_name == '<lambda>':
                        assert op_name == 'LOAD_CONST'
                        op_name = 'LOAD_LAMBDA'
                    elif const.co_name == self.genexpr_name:
                        op_name = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        op_name = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        op_name = 'LOAD_SETCOMP'
                    # verify uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
            elif op in self.opc.hasname:
                pattr = names[oparg]
            elif op in self.opc.hasjrel:
                # Relative jump: show the absolute target, counted from the
                # instruction that follows this one.
                pattr = repr(offset + 3 + oparg)
                if op == self.opc.JUMP_FORWARD:
                    # NOTE(review): the result is not used; the call is
                    # kept in case get_target() matters beyond its return
                    # value -- confirm and remove.
                    target = self.get_target(offset)
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    if tokens and tokens[-1].type == 'JUMP_BACK':
                        tokens[-1].type = intern('CONTINUE')
            elif op in self.opc.hasjabs:
                pattr = repr(oparg)
            elif op in self.opc.haslocal:
                pattr = varnames[oparg]
            elif op in self.opc.hascompare:
                pattr = self.opc.cmp_op[oparg]
            elif op in self.opc.hasfree:
                pattr = free[oparg]

        if op in self.varargs_ops:
            # CE - Hack for >= 2.5
            #      Now all values loaded via LOAD_CLOSURE are packed into
            #      a tuple before calling MAKE_CLOSURE.
            if (op == self.opc.BUILD_TUPLE and
                self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE):
                # That BUILD_TUPLE gets no token at all.
                continue
            else:
                # Specialize the name with the argument count, e.g. NAME_3.
                op_name = '%s_%d' % (op_name, oparg)
                if op != self.opc.BUILD_SLICE:
                    customize[op_name] = oparg
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries. The continue-type jumps help us get
            # "continue" statements which would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead.
            target = self.get_target(offset)
            if target <= offset:
                if (offset in self.stmts
                    and self.code[offset+3] not in (self.opc.END_FINALLY,
                                                    self.opc.POP_BLOCK)
                    and offset not in self.not_continue):
                    op_name = 'CONTINUE'
                else:
                    op_name = 'JUMP_BACK'
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    # Guarded like the JUMP_FORWARD case above so that an
                    # empty token list cannot raise IndexError.
                    if tokens and tokens[-1].type == 'JUMP_BACK':
                        tokens[-1].type = intern('CONTINUE')

        elif op == self.opc.LOAD_GLOBAL:
            if offset in self.load_asserts:
                op_name = 'LOAD_ASSERT'
        elif op == self.opc.RETURN_VALUE:
            if offset in self.return_end_ifs:
                op_name = 'RETURN_END_IF'

        if offset in self.linestartoffsets:
            linestart = self.linestartoffsets[offset]
        else:
            linestart = None

        # A name recorded in "replace" (the *_CONT names) overrides the
        # name computed above.
        token_name = replace[offset] if offset in replace else op_name
        tokens.append(Token(
            token_name, oparg, pattr, offset, linestart, op,
            has_arg))

    # Only 'before' suppresses the token dump; every other truthy value
    # still prints it, as before.
    if show_asm and show_asm != 'before':
        for t in tokens:
            print(t)
        print()
    return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
       -  some EXTENDED_ARGS instructions are removed

    Also, when we encounter certain tokens, we add them to a set which will cause
    custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or
    BUILD_LIST cause specific rules for the specific number of arguments they take.

    Parameters:
       co           -- the code object to scan.
       classname    -- not used by this method body.
       code_objects -- not used by this method body. It is never read or
                       mutated here, so the shared mutable default is harmless;
                       it is kept as-is for interface compatibility.
       show_asm     -- 'before', 'after' or 'both' to print the disassembly
                       before tokenizing and/or the tokens afterwards. When
                       falsy, self.show_asm is used instead.

    Returns a tuple (tokens, customize): the list of Token's and the dict of
    grammar customizations.

    Side effects: sets self.code, self.insts and self.load_asserts, and calls
    self.build_lines_data() and self.build_prev_op().
    """

    # FIXME: remove this when all subsidiary functions have been removed.
    # We should be able to get everything from the self.insts list.
    self.code = array('B', co.co_code)

    bytecode = Bytecode(co, self.opc)

    # An explicit (truthy) argument wins over the scanner-wide setting.
    if not show_asm:
        show_asm = self.show_asm

    # show_asm = 'both'
    if show_asm in ('both', 'before'):
        for instr in bytecode.get_instructions(co):
            print(instr.disassemble())

    # list of tokens/instructions
    tokens = []

    # "customize" is a dict whose keys are nonterminals
    # and the value is the argument stack entries for that
    # nonterminal. The count is a little hokey. It is mostly
    # not used, but sometimes it is.
    customize = {}

    if self.is_pypy:
        customize['PyPy'] = 0

    self.build_lines_data(co)
    self.build_prev_op()

    # FIXME: put as its own method?
    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    self.insts = list(bytecode)
    n = len(self.insts)
    for i, inst in enumerate(self.insts):
        # We need to detect the difference between:
        #   raise AssertionError
        #  and
        #   assert ...
        # If we have a JUMP_FORWARD after the
        # RAISE_VARARGS then we have a "raise" statement
        # else we have an "assert" statement.
        #
        # What is matched here is the three-instruction sequence
        #   POP_JUMP_IF_TRUE; LOAD_GLOBAL AssertionError; RAISE_VARARGS*
        # and the offset of the LOAD_GLOBAL is what gets recorded.
        if inst.opname == 'POP_JUMP_IF_TRUE' and i + 1 < n:
            next_inst = self.insts[i + 1]
            if (next_inst.opname == 'LOAD_GLOBAL' and
                    next_inst.argval == 'AssertionError'):
                if (i + 2 < n and
                        self.insts[i + 2].opname.startswith('RAISE_VARARGS')):
                    self.load_asserts.add(next_inst.offset)

    # Get jump targets
    # Format: {target offset: [jump offsets]}
    jump_targets = self.find_jump_targets(show_asm)
    # print("XXX2", jump_targets)

    # Set after a BREAK_LOOP token is emitted so that a CONTINUE
    # immediately following it can be dropped.
    last_op_was_break = False

    for i, inst in enumerate(bytecode):

        argval = inst.argval
        op = inst.opcode

        if op == self.opc.EXTENDED_ARG:
            # FIXME: The EXTENDED_ARG is used to signal annotation
            # parameters
            # Only an EXTENDED_ARG directly in front of MAKE_FUNCTION is
            # kept. The bounds check keeps a trailing EXTENDED_ARG from
            # raising IndexError on the look-ahead.
            if (i + 1 >= len(self.insts) or
                    self.insts[i + 1].opcode != self.opc.MAKE_FUNCTION):
                continue

        if inst.offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in *descending*
            # offset order so we have the larger range or biggest instruction interval
            # last. (I think they are sorted in increasing order, but for safety
            # we sort them). That way, specific COME_FROM tags will match up
            # properly. For example, a "loop" with an "if" nested in it should have the
            # "loop" tag last so the grammar rule matches that properly.
            for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
                come_from_name = 'COME_FROM'
                opname = self.opname_for_offset(jump_offset)
                if opname.startswith('SETUP_'):
                    # e.g. SETUP_LOOP yields COME_FROM_LOOP
                    come_from_type = opname[len('SETUP_'):]
                    come_from_name = 'COME_FROM_%s' % come_from_type
                elif inst.offset in self.except_targets:
                    come_from_name = 'COME_FROM_EXCEPT_CLAUSE'
                # The pseudo-token's offset is "<target>_<index>" so that
                # several COME_FROMs at one target stay distinguishable.
                tokens.append(
                    Token(come_from_name, None, repr(jump_offset),
                          offset='%s_%s' % (inst.offset, jump_idx),
                          has_arg=True, opc=self.opc))
                jump_idx += 1
        elif inst.offset in self.else_start:
            end_offset = self.else_start[inst.offset]
            tokens.append(
                Token('ELSE', None, repr(end_offset),
                      offset='%s' % (inst.offset),
                      has_arg=True, opc=self.opc))

        pattr = inst.argrepr
        opname = inst.opname

        if op in self.opc.CONST_OPS:
            const = argval
            if iscode(const):
                if const.co_name == '<lambda>':
                    assert opname == 'LOAD_CONST'
                    opname = 'LOAD_LAMBDA'
                elif const.co_name == '<genexpr>':
                    opname = 'LOAD_GENEXPR'
                elif const.co_name == '<dictcomp>':
                    opname = 'LOAD_DICTCOMP'
                elif const.co_name == '<setcomp>':
                    opname = 'LOAD_SETCOMP'
                elif const.co_name == '<listcomp>':
                    opname = 'LOAD_LISTCOMP'
                # verify() uses 'pattr' for comparison, since 'attr'
                # now holds Code(const) and thus can not be used
                # for comparison (todo: think about changing this)
                # pattr = 'code_object @ 0x%x %s->%s' %\
                # (id(const), const.co_filename, const.co_name)
                pattr = '<code_object ' + const.co_name + '>'
            else:
                pattr = const
        elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
            if self.version >= 3.6:
                # 3.6+ doesn't have MAKE_CLOSURE, so opname == 'MAKE_FUNCTION'
                flags = argval
                opname = 'MAKE_FUNCTION_%d' % (flags)
                attr = []
                # Walk the flag bits from least significant upwards: each
                # set bit adds its name to pattr, and every bit (set or
                # not) is recorded in attr.
                for flag in self.MAKE_FUNCTION_FLAGS:
                    bit = flags & 1
                    if bit:
                        if pattr:
                            pattr += ", " + flag
                        else:
                            pattr += flag
                    attr.append(bit)
                    flags >>= 1
                attr = attr[:4]  # remove last value: attr[5] == False
            else:
                pos_args, name_pair_args, annotate_args = parse_fn_counts(
                    inst.argval)
                pattr = ("%d positional, %d keyword pair, %d annotated" %
                         (pos_args, name_pair_args, annotate_args))
                if name_pair_args > 0:
                    opname = '%s_N%d' % (opname, name_pair_args)
                if annotate_args > 0:
                    opname = '%s_A_%d' % (opname, annotate_args)
                opname = '%s_%d' % (opname, pos_args)
                attr = (pos_args, name_pair_args, annotate_args)
            # This branch emits its own token (with the computed attr
            # instead of argval) and skips the common append below.
            tokens.append(
                Token(opname=opname, attr=attr, pattr=pattr,
                      offset=inst.offset, linestart=inst.starts_line,
                      op=op, has_arg=inst.has_arg, opc=self.opc))
            continue
        elif op in self.varargs_ops:
            pos_args = argval
            if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
                opname = 'BUILD_MAP_n'
            else:
                opname = '%s_%d' % (opname, pos_args)
        elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'):
            # The value in the dict is in special cases in semantic actions, such
            # as CALL_FUNCTION. The value is not used in these cases, so we put
            # in arbitrary value 0.
            customize[opname] = 0
        elif opname == 'UNPACK_EX':
            # FIXME: try with scanner and parser by
            # changing argval
            # Low byte: number of targets before the starred one;
            # next byte: number of targets after it.
            before_args = argval & 0xFF
            after_args = (argval >> 8) & 0xff
            pattr = "%d before vararg, %d after" % (before_args, after_args)
            argval = (before_args, after_args)
            opname = '%s_%d+%d' % (opname, before_args, after_args)
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries The continue-type jumps help us get
            # "continue" statements with would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            pattr = argval
            # FIXME: 0 isn't always correct
            target = self.get_target(inst.offset)
            if target <= inst.offset:
                # Look at the opcode right after this jump. Before 3.6 a
                # JUMP_ABSOLUTE plus its argument takes 3 bytes; from 3.6
                # on every instruction is 2 bytes, so a fixed "+ 3" would
                # read the next instruction's operand instead of its opcode.
                jump_size = 2 if self.version >= 3.6 else 3
                next_opname = self.opname[self.code[inst.offset + jump_size]]
                # NOTE(review): the instructions used in this method expose
                # 'starts_line'; whether they ever have a 'linestart'
                # attribute (which gates CONTINUE on 3.0) should be confirmed.
                if (inst.offset in self.stmts and
                    (self.version != 3.0 or (hasattr(inst, 'linestart'))) and
                    (next_opname not in ('END_FINALLY', 'POP_BLOCK',
                                         # Python 3.0 only uses POP_TOP
                                         'POP_TOP'))):
                    opname = 'CONTINUE'
                else:
                    opname = 'JUMP_BACK'
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    # There are other situations where we don't catch
                    # CONTINUE as well.
                    # The emptiness/length checks keep the look-behind from
                    # raising IndexError when fewer than two tokens exist.
                    if (tokens and tokens[-1].kind == 'JUMP_BACK' and
                            tokens[-1].attr <= argval):
                        if len(tokens) > 1 and tokens[-2].kind == 'BREAK_LOOP':
                            del tokens[-1]
                        else:
                            # intern is used because we are changing the *previous* token
                            tokens[-1].kind = intern('CONTINUE')
                if last_op_was_break and opname == 'CONTINUE':
                    # Drop a CONTINUE that directly follows a BREAK_LOOP.
                    last_op_was_break = False
                    continue
        # FIXME: go over for Python 3.6+. This is sometimes wrong
        elif op == self.opc.RETURN_VALUE:
            if inst.offset in self.return_end_ifs:
                opname = 'RETURN_END_IF'
        elif inst.offset in self.load_asserts:
            opname = 'LOAD_ASSERT'

        last_op_was_break = opname == 'BREAK_LOOP'
        tokens.append(
            Token(opname=opname, attr=argval, pattr=pattr,
                  offset=inst.offset, linestart=inst.starts_line,
                  op=op, has_arg=inst.has_arg, opc=self.opc))

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t)
        print()
    return tokens, customize