def number_loop(queue, mappings, opc): while len(queue) > 0: code1 = queue.popleft() code2 = queue.popleft() assert code1.co_name == code2.co_name linestarts_orig = findlinestarts(code1) linestarts_uncompiled = list(findlinestarts(code2)) mappings += [[line, offset2line(offset, linestarts_uncompiled)] for offset, line in linestarts_orig] bytecode1 = Bytecode(code1, opc) bytecode2 = Bytecode(code2, opc) instr2s = bytecode2.get_instructions(code2) seen = set([code1.co_name]) for instr in bytecode1.get_instructions(code1): next_code1 = None if iscode(instr.argval): next_code1 = instr.argval if next_code1: next_code2 = None while not next_code2: try: instr2 = next(instr2s) if iscode(instr2.argval): next_code2 = instr2.argval pass except StopIteration: break pass if next_code2: assert next_code1.co_name == next_code2.co_name if next_code1.co_name not in seen: seen.add(next_code1.co_name) queue.append(next_code1) queue.append(next_code2) pass pass pass pass
def number_loop(queue, mappings, opc): while len(queue) > 0: code1 = queue.popleft() code2 = queue.popleft() assert code1.co_name == code2.co_name linestarts_orig = findlinestarts(code1) linestarts_uncompiled = list(findlinestarts(code2)) mappings += [[line, offset2line(offset, linestarts_uncompiled)] for offset, line in linestarts_orig] bytecode1 = Bytecode(code1, opc) bytecode2 = Bytecode(code2, opc) instr2s = bytecode2.get_instructions(code2) seen = set([code1.co_name]) for instr in bytecode1.get_instructions(code1): next_code1 = None if iscode(instr.argval): next_code1 = instr.argval if next_code1: next_code2 = None while not next_code2: try: instr2 = next(instr2s) if iscode(instr2.argval): next_code2 = instr2.argval pass except StopIteration: break pass if next_code2: assert next_code1.co_name == next_code2.co_name if next_code1.co_name not in seen: seen.add(next_code1.co_name) queue.append(next_code1) queue.append(next_code2) pass pass pass pass
def test_inst_size(): if (PYTHON_VERSION_TRIPLE[:2] == (3, 6)) and not IS_PYPY: opc = get_opcode_module(sys.version_info) bytecode_obj = Bytecode(extended_arg_fn36, opc) instructions = list(bytecode_obj.get_instructions(extended_arg_fn36)) inst1 = instructions[1] assert inst1.opname == 'EXTENDED_ARG' assert inst1.argval == 0 inst2 = instructions[2] assert inst2.opname == 'POP_JUMP_IF_FALSE' assert inst2.has_extended_arg == True assert inst2.inst_size == 4 # for inst in instructions: # print(inst) else: assert True
def test_inst_jumps(): if (sys.version_info >= (2, 7)): variant = 'pypy' if IS_PYPY else None opc = get_opcode_module(sys.version_info, variant) bytecode_obj = Bytecode(extended_arg_fn36, opc) instructions = list(bytecode_obj.get_instructions(loop)) seen_pjif = False seen_ja = False for inst in instructions: if inst.opname == "POP_JUMP_IF_FALSE": assert inst.is_jump() seen_pjif = True elif inst.opname == "JUMP_ABSOLUTE": assert inst.is_jump() assert not inst.jumps_forward() seen_ja = True pass pass assert seen_pjif assert seen_ja
def test_inst_size(): if (sys.version_info == (3, 6)): variant = 'pypy' if IS_PYPY else None opc = get_opcode_module(sys.version_info, variant) bytecode_obj = Bytecode(extended_arg_fn36, opc) instructions = list(bytecode_obj.get_instructions(extended_arg_fn36)) inst1 = instructions[1] assert inst1.opname == 'EXTENDED_ARG' assert inst1.argval == 0 inst2 = instructions[2] assert inst2.opname == 'POP_JUMP_IF_FALSE' assert inst2.has_extended_arg == True assert inst2.inst_size == 4 # for inst in instructions: # print(inst) else: assert True
def test_inst_size(): if (sys.version_info == (3,6)): variant = 'pypy' if IS_PYPY else None opc = get_opcode_module(sys.version_info, variant) bytecode_obj = Bytecode(extended_arg_fn36, opc) instructions = list(bytecode_obj.get_instructions(extended_arg_fn36)) inst1 = instructions[1] assert inst1.opname == 'EXTENDED_ARG' assert inst1.argval == 0 inst2 = instructions[2] assert inst2.opname == 'POP_JUMP_IF_FALSE' assert inst2.has_extended_arg == True assert inst2.inst_size == 4 # for inst in instructions: # print(inst) else: assert True
def test_inst_jumps(): if (sys.version_info >= (2, 7)): variant = 'pypy' if IS_PYPY else None opc = get_opcode_module(sys.version_info, variant) bytecode_obj = Bytecode(extended_arg_fn36, opc) instructions = list(bytecode_obj.get_instructions(loop)) seen_pjif = False seen_ja = False for inst in instructions: if inst.opname == "POP_JUMP_IF_FALSE": assert inst.is_jump() seen_pjif = True elif inst.opname == "JUMP_ABSOLUTE": assert inst.is_jump() assert not inst.jumps_forward() seen_ja = True pass pass assert seen_pjif # Python 3.10 code generation is more efficient and doesn't # and removes a JUMP_ABSOLUTE. if PYTHON_VERSION_TRIPLE < (3, 10): assert seen_ja
def disassemble(self, co, classname=None, code_objects={}, show_asm=None): ''' Disassemble a code object, returning a list of 'Token'. The main part of this procedure is modelled after dis.disassemble(). ''' show_asm = self.show_asm if not show_asm else show_asm if show_asm in ('both', 'before'): from xdis.bytecode import Bytecode bytecode = Bytecode(co, self.opc) for instr in bytecode.get_instructions(co): print(instr._disassemble()) # from xdis.bytecode import Bytecode # bytecode = Bytecode(co, self.opc) # for instr in bytecode.get_instructions(co): # print(instr._disassemble()) # Container for tokens tokens = [] customize = {} Token = self.Token # shortcut n = self.setup_code(co) self.build_lines_data(co, n) self.build_prev_op(n) # linestarts contains block code adresses (addr,block) self.linestarts = list(findlinestarts(co)) # class and names if classname: classname = '_' + classname.lstrip('_') + '__' def unmangle(name): if name.startswith(classname) and name[-2:] != '__': return name[len(classname) - 2:] return name free = [ unmangle(name) for name in (co.co_cellvars + co.co_freevars) ] names = [ unmangle(name) for name in co.co_names ] varnames = [ unmangle(name) for name in co.co_varnames ] else: free = co.co_cellvars + co.co_freevars names = co.co_names varnames = co.co_varnames self.names = names codelen = len(self.code) # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() for i in self.op_range(0, n): # We need to detect the difference between # "raise AssertionError" and # "assert" if (self.code[i] == self.opc.JUMP_IF_TRUE and i + 4 < codelen and self.code[i+3] == self.opc.POP_TOP and self.code[i+4] == self.opc.LOAD_GLOBAL): if names[self.get_argument(i+4)] == 'AssertionError': self.load_asserts.add(i+4) cf = self.find_jump_targets() # contains (code, [addrRefToCode]) last_stmt = self.next_stmt[0] i = self.next_stmt[last_stmt] replace = {} while i < codelen - 1: if self.lines[last_stmt].next > i: if self.code[last_stmt] == self.opc.PRINT_ITEM: if self.code[i] == self.opc.PRINT_ITEM: replace[i] = 'PRINT_ITEM_CONT' elif self.code[i] == self.opc.PRINT_NEWLINE: replace[i] = 'PRINT_NEWLINE_CONT' last_stmt = i i = self.next_stmt[i] imports = self.all_instr(0, codelen, (self.opc.IMPORT_NAME, self.opc.IMPORT_FROM, self.opc.IMPORT_STAR)) # Changes IMPORT_NAME to IMPORT_NAME_CONT. # Possibly a Python 2.0 hangover if len(imports) > 1 and self.version < 2.3: last_import = imports[0] for i in imports[1:]: if self.lines[last_import].next > i: if self.code[last_import] == self.opc.IMPORT_NAME == self.code[i]: replace[i] = 'IMPORT_NAME_CONT' last_import = i extended_arg = 0 for offset in self.op_range(0, codelen): op = self.code[offset] op_name = self.opname[op] oparg = None; pattr = None if offset in cf: k = 0 for j in cf[offset]: tokens.append(Token('COME_FROM', None, repr(j), offset="%s_%d" % (offset, k), has_arg = True)) k += 1 has_arg = (op >= self.opc.HAVE_ARGUMENT) if has_arg: oparg = self.get_argument(offset) + extended_arg extended_arg = 0 if op == self.opc.EXTENDED_ARG: raise NotImplementedError extended_arg = oparg * scan.L65536 continue if op in self.opc.hasconst: const = co.co_consts[oparg] # We can't use inspect.iscode() because we may be # using a different version of Python than the # one that this was byte-compiled on. So the code # types may mismatch. if hasattr(const, 'co_name'): oparg = const if const.co_name == '<lambda>': assert op_name == 'LOAD_CONST' op_name = 'LOAD_LAMBDA' elif const.co_name == self.genexpr_name: op_name = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': op_name = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': op_name = 'LOAD_SETCOMP' # verify uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' % \ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const elif op in self.opc.hasname: pattr = names[oparg] elif op in self.opc.hasjrel: pattr = repr(offset + 3 + oparg) if op == self.opc.JUMP_FORWARD: target = self.get_target(offset) # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. if len(tokens) and tokens[-1].type == 'JUMP_BACK': tokens[-1].type = intern('CONTINUE') elif op in self.opc.hasjabs: pattr = repr(oparg) elif op in self.opc.haslocal: pattr = varnames[oparg] elif op in self.opc.hascompare: pattr = self.opc.cmp_op[oparg] elif op in self.opc.hasfree: pattr = free[oparg] if op in self.varargs_ops: # CE - Hack for >= 2.5 # Now all values loaded via LOAD_CLOSURE are packed into # a tuple before calling MAKE_CLOSURE. if (op == self.opc.BUILD_TUPLE and self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE): continue else: op_name = '%s_%d' % (op_name, oparg) if op != self.opc.BUILD_SLICE: customize[op_name] = oparg elif op == self.opc.JUMP_ABSOLUTE: # Further classifhy JUMP_ABSOLUTE into backward jumps # which are used in loops, and "CONTINUE" jumps which # may appear in a "continue" statement. The loop-type # and continue-type jumps will help us classify loop # boundaries The continue-type jumps help us get # "continue" statements with would otherwise be turned # into a "pass" statement because JUMPs are sometimes # ignored in rules as just boundary overhead. target = self.get_target(offset) if target <= offset: if (offset in self.stmts and self.code[offset+3] not in (self.opc.END_FINALLY, self.opc.POP_BLOCK) and offset not in self.not_continue): op_name = 'CONTINUE' else: op_name = 'JUMP_BACK' # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. if tokens[-1].type == 'JUMP_BACK': tokens[-1].type = intern('CONTINUE') elif op == self.opc.LOAD_GLOBAL: if offset in self.load_asserts: op_name = 'LOAD_ASSERT' elif op == self.opc.RETURN_VALUE: if offset in self.return_end_ifs: op_name = 'RETURN_END_IF' if offset in self.linestartoffsets: linestart = self.linestartoffsets[offset] else: linestart = None if offset not in replace: tokens.append(Token( op_name, oparg, pattr, offset, linestart, op, has_arg)) else: tokens.append(Token( replace[offset], oparg, pattr, offset, linestart, op, has_arg)) pass pass if show_asm: for t in tokens: print(t) print() return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None): """ Pick out tokens from an uncompyle6 code object, and transform them, returning a list of uncompyle6 'Token's. The transformations are made to assist the deparsing grammar. Specificially: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments Also, when we encounter certain tokens, we add them to a set which will cause custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific rules for the specific number of arguments they take. """ show_asm = self.show_asm if not show_asm else show_asm # show_asm = 'after' if show_asm in ('both', 'before'): from xdis.bytecode import Bytecode bytecode = Bytecode(co, self.opc) for instr in bytecode.get_instructions(co): print(instr._disassemble()) # Container for tokens tokens = [] customize = {} if self.is_pypy: customize['PyPy'] = 1 Token = self.Token # shortcut n = self.setup_code(co) self.build_lines_data(co, n) self.build_prev_op(n) free, names, varnames = self.unmangle_code_names(co, classname) self.names = names # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() for i in self.op_range(0, n): # We need to detect the difference between: # raise AssertionError # and # assert ... # Below we use the heuristic that it is preceded by a POP_JUMP. # however we could also use followed by RAISE_VARARGS # or for PyPy there may be a JUMP_IF_NOT_DEBUG before. # FIXME: remove uses of PJIF, and PJIT if self.is_pypy: have_pop_jump = self.code[i] in (self.opc.PJIF, self.opc.PJIT) else: have_pop_jump = self.code[i] == self.opc.PJIT if have_pop_jump and self.code[i + 3] == self.opc.LOAD_GLOBAL: if names[self.get_argument(i + 3)] == 'AssertionError': self.load_asserts.add(i + 3) jump_targets = self.find_jump_targets(show_asm) # contains (code, [addrRefToCode]) last_stmt = self.next_stmt[0] i = self.next_stmt[last_stmt] replace = {} while i < n - 1: if self.lines[last_stmt].next > i: # Distinguish "print ..." from "print ...," if self.code[last_stmt] == self.opc.PRINT_ITEM: if self.code[i] == self.opc.PRINT_ITEM: replace[i] = 'PRINT_ITEM_CONT' elif self.code[i] == self.opc.PRINT_NEWLINE: replace[i] = 'PRINT_NEWLINE_CONT' last_stmt = i i = self.next_stmt[i] extended_arg = 0 for offset in self.op_range(0, n): if offset in jump_targets: jump_idx = 0 # We want to process COME_FROMs to the same offset to be in *descending* # offset order so we have the larger range or biggest instruction interval # last. (I think they are sorted in increasing order, but for safety # we sort them). That way, specific COME_FROM tags will match up # properly. For example, a "loop" with an "if" nested in it should have the # "loop" tag last so the grammar rule matches that properly. # last_offset = -1 for jump_offset in sorted(jump_targets[offset], reverse=True): # if jump_offset == last_offset: # continue # last_offset = jump_offset come_from_name = 'COME_FROM' op_name = self.opc.opname[self.code[jump_offset]] if op_name.startswith('SETUP_') and self.version == 2.7: come_from_type = op_name[len('SETUP_'):] if come_from_type not in ('LOOP', 'EXCEPT'): come_from_name = 'COME_FROM_%s' % come_from_type pass tokens.append( Token(come_from_name, None, repr(jump_offset), offset="%s_%d" % (offset, jump_idx), has_arg=True)) jump_idx += 1 op = self.code[offset] op_name = self.opc.opname[op] oparg = None pattr = None has_arg = op_has_argument(op, self.opc) if has_arg: oparg = self.get_argument(offset) + extended_arg extended_arg = 0 if op == self.opc.EXTENDED_ARG: extended_arg = oparg * scan.L65536 continue if op in self.opc.hasconst: const = co.co_consts[oparg] if iscode(const): oparg = const if const.co_name == '<lambda>': assert op_name == 'LOAD_CONST' op_name = 'LOAD_LAMBDA' elif const.co_name == '<genexpr>': op_name = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': op_name = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': op_name = 'LOAD_SETCOMP' # verify() uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' %\ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const elif op in self.opc.hasname: pattr = names[oparg] elif op in self.opc.hasjrel: # use instead: hasattr(self, 'patch_continue'): ? if self.version == 2.7: self.patch_continue(tokens, offset, op) pattr = repr(offset + 3 + oparg) elif op in self.opc.hasjabs: # use instead: hasattr(self, 'patch_continue'): ? if self.version == 2.7: self.patch_continue(tokens, offset, op) pattr = repr(oparg) elif op in self.opc.haslocal: pattr = varnames[oparg] elif op in self.opc.hascompare: pattr = self.opc.cmp_op[oparg] elif op in self.opc.hasfree: pattr = free[oparg] if op in self.varargs_ops: # CE - Hack for >= 2.5 # Now all values loaded via LOAD_CLOSURE are packed into # a tuple before calling MAKE_CLOSURE. if op == self.opc.BUILD_TUPLE and \ self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE: continue else: if self.is_pypy and not oparg and op_name == 'BUILD_MAP': op_name = 'BUILD_MAP_n' else: op_name = '%s_%d' % (op_name, oparg) if op != self.opc.BUILD_SLICE: customize[op_name] = oparg elif self.is_pypy and op_name in ('LOOKUP_METHOD', 'JUMP_IF_NOT_DEBUG', 'SETUP_EXCEPT', 'SETUP_FINALLY'): # The value in the dict is in special cases in semantic actions, such # as CALL_FUNCTION. The value is not used in these cases, so we put # in arbitrary value 0. customize[op_name] = 0 elif op == self.opc.JUMP_ABSOLUTE: # Further classify JUMP_ABSOLUTE into backward jumps # which are used in loops, and "CONTINUE" jumps which # may appear in a "continue" statement. The loop-type # and continue-type jumps will help us classify loop # boundaries The continue-type jumps help us get # "continue" statements with would otherwise be turned # into a "pass" statement because JUMPs are sometimes # ignored in rules as just boundary overhead. In # comprehensions we might sometimes classify JUMP_BACK # as CONTINUE, but that's okay since we add a grammar # rule for that. target = self.get_target(offset) if target <= offset: if (offset in self.stmts and self.code[offset + 3] not in (self.opc.END_FINALLY, self.opc.POP_BLOCK) and offset not in self.not_continue): op_name = 'CONTINUE' else: op_name = 'JUMP_BACK' elif op == self.opc.LOAD_GLOBAL: if offset in self.load_asserts: op_name = 'LOAD_ASSERT' elif op == self.opc.RETURN_VALUE: if offset in self.return_end_ifs: op_name = 'RETURN_END_IF' if offset in self.linestartoffsets: linestart = self.linestartoffsets[offset] else: linestart = None if offset not in replace: tokens.append( Token(op_name, oparg, pattr, offset, linestart, op, has_arg, self.opc)) else: tokens.append( Token(replace[offset], oparg, pattr, offset, linestart, op, has_arg, self.opc)) pass pass if show_asm in ('both', 'after'): for t in tokens: print(t.format(line_prefix='L.')) print() return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None): """ Pick out tokens from an uncompyle6 code object, and transform them, returning a list of uncompyle6 'Token's. The transformations are made to assist the deparsing grammar. Specificially: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments Also, when we encounter certain tokens, we add them to a set which will cause custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific rules for the specific number of arguments they take. """ show_asm = self.show_asm if not show_asm else show_asm # show_asm = 'after' if show_asm in ('both', 'before'): bytecode = Bytecode(co, self.opc) for instr in bytecode.get_instructions(co): print(instr._disassemble()) # Container for tokens tokens = [] customize = {} if self.is_pypy: customize['PyPy'] = 1 self.code = array('B', co.co_code) self.build_lines_data(co) self.build_prev_op() bytecode = Bytecode(co, self.opc) # FIXME: put as its own method? # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() bs = list(bytecode) n = len(bs) for i in range(n): inst = bs[i] # We need to detect the difference between # "raise AssertionError" and "assert" # If we have a JUMP_FORWARD after the # RAISE_VARARGS then we have a "raise" statement # else we have an "assert" statement. if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n: next_inst = bs[i+1] if (next_inst.opname == 'LOAD_GLOBAL' and next_inst.argval == 'AssertionError'): for j in range(i+2, n): raise_inst = bs[j] if raise_inst.opname.startswith('RAISE_VARARGS'): if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD': self.load_asserts.add(next_inst.offset) pass break pass pass # Get jump targets # Format: {target offset: [jump offsets]} jump_targets = self.find_jump_targets(show_asm) for inst in bytecode: argval = inst.argval if inst.offset in jump_targets: jump_idx = 0 # We want to process COME_FROMs to the same offset to be in *descending* # offset order so we have the larger range or biggest instruction interval # last. (I think they are sorted in increasing order, but for safety # we sort them). That way, specific COME_FROM tags will match up # properly. For example, a "loop" with an "if" nested in it should have the # "loop" tag last so the grammar rule matches that properly. for jump_offset in sorted(jump_targets[inst.offset], reverse=True): come_from_name = 'COME_FROM' opname = self.opName(jump_offset) if opname.startswith('SETUP_'): come_from_type = opname[len('SETUP_'):] come_from_name = 'COME_FROM_%s' % come_from_type pass tokens.append(Token(come_from_name, None, repr(jump_offset), offset='%s_%s' % (inst.offset, jump_idx), has_arg = True, opc=self.opc)) jump_idx += 1 pass pass elif inst.offset in self.else_start: end_offset = self.else_start[inst.offset] tokens.append(Token('ELSE', None, repr(end_offset), offset='%s' % (inst.offset), has_arg = True, opc=self.opc)) pass pattr = inst.argrepr opname = inst.opname op = inst.opcode if opname in ['LOAD_CONST']: const = inst.argval if iscode(const): if const.co_name == '<lambda>': opname = 'LOAD_LAMBDA' elif const.co_name == '<genexpr>': opname = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': opname = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': opname = 'LOAD_SETCOMP' elif const.co_name == '<listcomp>': opname = 'LOAD_LISTCOMP' # verify() uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' %\ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const pass elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'): pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval) if name_pair_args > 0: opname = '%s_N%d' % (opname, name_pair_args) pass if annotate_args > 0: opname = '%s_A_%d' % (opname, annotate_args) pass opname = '%s_%d' % (opname, pos_args) pattr = ("%d positional, %d keyword pair, %d annotated" % (pos_args, name_pair_args, annotate_args)) tokens.append( Token( type_ = opname, attr = (pos_args, name_pair_args, annotate_args), pattr = pattr, offset = inst.offset, linestart = inst.starts_line, op = op, has_arg = op_has_argument(op, op3), opc = self.opc ) ) continue elif op in self.varargs_ops: pos_args = inst.argval if self.is_pypy and not pos_args and opname == 'BUILD_MAP': opname = 'BUILD_MAP_n' else: opname = '%s_%d' % (opname, pos_args) elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'): # The value in the dict is in special cases in semantic actions, such # as CALL_FUNCTION. The value is not used in these cases, so we put # in arbitrary value 0. customize[opname] = 0 elif opname == 'UNPACK_EX': # FIXME: try with scanner and parser by # changing inst.argval before_args = inst.argval & 0xFF after_args = (inst.argval >> 8) & 0xff pattr = "%d before vararg, %d after" % (before_args, after_args) argval = (before_args, after_args) opname = '%s_%d+%d' % (opname, before_args, after_args) elif op == self.opc.JUMP_ABSOLUTE: # Further classify JUMP_ABSOLUTE into backward jumps # which are used in loops, and "CONTINUE" jumps which # may appear in a "continue" statement. The loop-type # and continue-type jumps will help us classify loop # boundaries The continue-type jumps help us get # "continue" statements with would otherwise be turned # into a "pass" statement because JUMPs are sometimes # ignored in rules as just boundary overhead. In # comprehensions we might sometimes classify JUMP_BACK # as CONTINUE, but that's okay since we add a grammar # rule for that. pattr = inst.argval target = self.get_target(inst.offset) if target <= inst.offset: next_opname = self.opname[self.code[inst.offset+3]] if (inst.offset in self.stmts and next_opname not in ('END_FINALLY', 'POP_BLOCK', # Python 3.0 only uses POP_TOP 'POP_TOP') and inst.offset not in self.not_continue): opname = 'CONTINUE' else: opname = 'JUMP_BACK' # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. # There are other situations where we don't catch # CONTINUE as well. if tokens[-1].type == 'JUMP_BACK' and tokens[-1].attr <= argval: # intern is used because we are changing the *previous* token tokens[-1].type = intern('CONTINUE') elif op == self.opc.RETURN_VALUE: if inst.offset in self.return_end_ifs: opname = 'RETURN_END_IF' elif inst.offset in self.load_asserts: opname = 'LOAD_ASSERT' tokens.append( Token( type_ = opname, attr = argval, pattr = pattr, offset = inst.offset, linestart = inst.starts_line, op = op, has_arg = (op >= op3.HAVE_ARGUMENT), opc = self.opc ) ) pass if show_asm in ('both', 'after'): for t in tokens: print(t) print() return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None): """ Pick out tokens from an uncompyle6 code object, and transform them, returning a list of uncompyle6 'Token's. The transformations are made to assist the deparsing grammar. Specificially: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments Also, when we encounter certain tokens, we add them to a set which will cause custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific rules for the specific number of arguments they take. """ show_asm = self.show_asm if not show_asm else show_asm # show_asm = 'after' if show_asm in ('both', 'before'): from xdis.bytecode import Bytecode bytecode = Bytecode(co, self.opc) for instr in bytecode.get_instructions(co): print(instr._disassemble()) # Container for tokens tokens = [] customize = {} if self.is_pypy: customize['PyPy'] = 1 Token = self.Token # shortcut codelen = self.setup_code(co) self.build_lines_data(co, codelen) self.build_prev_op(codelen) free, names, varnames = self.unmangle_code_names(co, classname) self.names = names # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() for i in self.op_range(0, codelen): # We need to detect the difference between: # raise AssertionError # and # assert ... if (self.code[i] == self.opc.JUMP_IF_TRUE and i + 4 < codelen and self.code[i + 3] == self.opc.POP_TOP and self.code[i + 4] == self.opc.LOAD_GLOBAL): if names[self.get_argument(i + 4)] == 'AssertionError': self.load_asserts.add(i + 4) jump_targets = self.find_jump_targets(show_asm) # contains (code, [addrRefToCode]) last_stmt = self.next_stmt[0] i = self.next_stmt[last_stmt] replace = {} while i < codelen - 1: if self.lines[last_stmt].next > i: # Distinguish "print ..." from "print ...," if self.code[last_stmt] == self.opc.PRINT_ITEM: if self.code[i] == self.opc.PRINT_ITEM: replace[i] = 'PRINT_ITEM_CONT' elif self.code[i] == self.opc.PRINT_NEWLINE: replace[i] = 'PRINT_NEWLINE_CONT' last_stmt = i i = self.next_stmt[i] extended_arg = 0 for offset in self.op_range(0, codelen): op = self.code[offset] op_name = self.opname[op] oparg = None pattr = None if offset in jump_targets: jump_idx = 0 # We want to process COME_FROMs to the same offset to be in *descending* # offset order so we have the larger range or biggest instruction interval # last. (I think they are sorted in increasing order, but for safety # we sort them). That way, specific COME_FROM tags will match up # properly. For example, a "loop" with an "if" nested in it should have the # "loop" tag last so the grammar rule matches that properly. last_jump_offset = -1 for jump_offset in sorted(jump_targets[offset], reverse=True): if jump_offset != last_jump_offset: tokens.append( Token('COME_FROM', None, repr(jump_offset), offset="%s_%d" % (offset, jump_idx), has_arg=True)) jump_idx += 1 last_jump_offset = jump_offset elif offset in self.thens: tokens.append( Token('THEN', None, self.thens[offset], offset="%s_0" % offset, has_arg=True)) has_arg = (op >= self.opc.HAVE_ARGUMENT) if has_arg: oparg = self.get_argument(offset) + extended_arg extended_arg = 0 if op == self.opc.EXTENDED_ARG: extended_arg = oparg * L65536 continue if op in self.opc.CONST_OPS: const = co.co_consts[oparg] # We can't use inspect.iscode() because we may be # using a different version of Python than the # one that this was byte-compiled on. So the code # types may mismatch. if hasattr(const, 'co_name'): oparg = const if const.co_name == '<lambda>': assert op_name == 'LOAD_CONST' op_name = 'LOAD_LAMBDA' elif const.co_name == self.genexpr_name: op_name = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': op_name = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': op_name = 'LOAD_SETCOMP' # verify uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' % \ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const elif op in self.opc.NAME_OPS: pattr = names[oparg] elif op in self.opc.JREL_OPS: pattr = repr(offset + 3 + oparg) if op == self.opc.JUMP_FORWARD: target = self.get_target(offset) # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. if len(tokens) and tokens[-1].type == 'JUMP_BACK': tokens[-1].type = intern('CONTINUE') elif op in self.opc.JABS_OPS: pattr = repr(oparg) elif op in self.opc.LOCAL_OPS: pattr = varnames[oparg] elif op in self.opc.COMPARE_OPS: pattr = self.opc.cmp_op[oparg] elif op in self.opc.FREE_OPS: pattr = free[oparg] if op in self.varargs_ops: # CE - Hack for >= 2.5 # Now all values loaded via LOAD_CLOSURE are packed into # a tuple before calling MAKE_CLOSURE. if (self.version >= 2.5 and op == self.opc.BUILD_TUPLE and self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE): continue else: op_name = '%s_%d' % (op_name, oparg) if op != self.opc.BUILD_SLICE: customize[op_name] = oparg elif op == self.opc.JUMP_ABSOLUTE: # Further classify JUMP_ABSOLUTE into backward jumps # which are used in loops, and "CONTINUE" jumps which # may appear in a "continue" statement. The loop-type # and continue-type jumps will help us classify loop # boundaries The continue-type jumps help us get # "continue" statements with would otherwise be turned # into a "pass" statement because JUMPs are sometimes # ignored in rules as just boundary overhead. In # comprehensions we might sometimes classify JUMP_BACK # as CONTINUE, but that's okay since we add a grammar # rule for that. target = self.get_target(offset) if target <= offset: op_name = 'JUMP_BACK' if (offset in self.stmts and self.code[offset + 3] not in (self.opc.END_FINALLY, self.opc.POP_BLOCK)): if ((offset in self.linestartoffsets and tokens[-1].type == 'JUMP_BACK') or offset not in self.not_continue): op_name = 'CONTINUE' else: # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. if tokens[-1].type == 'JUMP_BACK': # We need 'intern' since we have # already have processed the previous # token. tokens[-1].type = intern('CONTINUE') elif op == self.opc.LOAD_GLOBAL: if offset in self.load_asserts: op_name = 'LOAD_ASSERT' elif op == self.opc.RETURN_VALUE: if offset in self.return_end_ifs: op_name = 'RETURN_END_IF' if offset in self.linestartoffsets: linestart = self.linestartoffsets[offset] else: linestart = None if offset not in replace: tokens.append( Token(op_name, oparg, pattr, offset, linestart, op, has_arg, self.opc)) else: tokens.append( Token(replace[offset], oparg, pattr, offset, linestart, op, has_arg, self.opc)) pass pass if show_asm in ('both', 'after'): for t in tokens: print(t.format(line_prefix='L.')) print() return tokens, customize
def disassemble(self, co, classname=None, code_objects={}, show_asm=None): """ Pick out tokens from an uncompyle6 code object, and transform them, returning a list of uncompyle6 'Token's. The tranformations are made to assist the deparsing grammar. Specificially: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments Also, when we encounter certain tokens, we add them to a set which will cause custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific rules for the specific number of arguments they take. """ show_asm = self.show_asm if not show_asm else show_asm # show_asm = 'before' if show_asm in ('both', 'before'): from xdis.bytecode import Bytecode bytecode = Bytecode(co, self.opc) for instr in bytecode.get_instructions(co): print(instr._disassemble()) # Container for tokens tokens = [] customize = {} if self.is_pypy: customize['PyPy'] = 1; Token = self.Token # shortcut n = self.setup_code(co) self.build_lines_data(co, n) self.build_prev_op(n) # self.lines contains (block,addrLastInstr) if classname: classname = '_' + classname.lstrip('_') + '__' def unmangle(name): if name.startswith(classname) and name[-2:] != '__': return name[len(classname) - 2:] return name free = [ unmangle(name) for name in (co.co_cellvars + co.co_freevars) ] names = [ unmangle(name) for name in co.co_names ] varnames = [ unmangle(name) for name in co.co_varnames ] else: free = co.co_cellvars + co.co_freevars names = co.co_names varnames = co.co_varnames self.names = names # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() for i in self.op_range(0, n): # We need to detect the difference between: # raise AssertionError # and # assert ... # Below we use the heuristic that it is preceded by a POP_JUMP. # however we could also use followed by RAISE_VARARGS # or for PyPy there may be a JUMP_IF_NOT_DEBUG before. # FIXME: remove uses of PJIF, and PJIT if self.is_pypy: have_pop_jump = self.code[i] in (self.opc.PJIF, self.opc.PJIT) else: have_pop_jump = self.code[i] == self.opc.PJIT if have_pop_jump and self.code[i+3] == self.opc.LOAD_GLOBAL: if names[self.get_argument(i+3)] == 'AssertionError': self.load_asserts.add(i+3) cf = self.find_jump_targets() # contains (code, [addrRefToCode]) last_stmt = self.next_stmt[0] i = self.next_stmt[last_stmt] replace = {} while i < n-1: if self.lines[last_stmt].next > i: if self.code[last_stmt] == self.opc.PRINT_ITEM: if self.code[i] == self.opc.PRINT_ITEM: replace[i] = 'PRINT_ITEM_CONT' elif self.code[i] == self.opc.PRINT_NEWLINE: replace[i] = 'PRINT_NEWLINE_CONT' last_stmt = i i = self.next_stmt[i] extended_arg = 0 for offset in self.op_range(0, n): if offset in cf: k = 0 for j in cf[offset]: tokens.append(Token( 'COME_FROM', None, repr(j), offset="%s_%d" % (offset, k), has_arg = True)) k += 1 op = self.code[offset] opname = self.opc.opname[op] oparg = None; pattr = None has_arg = (op >= self.opc.HAVE_ARGUMENT) if has_arg: oparg = self.get_argument(offset) + extended_arg extended_arg = 0 if op == self.opc.EXTENDED_ARG: extended_arg = oparg * scan.L65536 continue if op in self.opc.hasconst: const = co.co_consts[oparg] if iscode(const): oparg = const if const.co_name == '<lambda>': assert opname == 'LOAD_CONST' opname = 'LOAD_LAMBDA' elif const.co_name == '<genexpr>': opname = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': opname = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': opname = 'LOAD_SETCOMP' # verify() uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' %\ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const elif op in self.opc.hasname: pattr = names[oparg] elif op in self.opc.hasjrel: # use instead: hasattr(self, 'patch_continue'): ? if self.version == 2.7: self.patch_continue(tokens, offset, op) pattr = repr(offset + 3 + oparg) elif op in self.opc.hasjabs: # use instead: hasattr(self, 'patch_continue'): ? if self.version == 2.7: self.patch_continue(tokens, offset, op) pattr = repr(oparg) elif op in self.opc.haslocal: pattr = varnames[oparg] elif op in self.opc.hascompare: pattr = self.opc.cmp_op[oparg] elif op in self.opc.hasfree: pattr = free[oparg] if op in self.varargs_ops: # CE - Hack for >= 2.5 # Now all values loaded via LOAD_CLOSURE are packed into # a tuple before calling MAKE_CLOSURE. if op == self.opc.BUILD_TUPLE and \ self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE: continue else: if self.is_pypy and not oparg and opname == 'BUILD_MAP': opname = 'BUILD_MAP_n' else: opname = '%s_%d' % (opname, oparg) if op != self.opc.BUILD_SLICE: customize[opname] = oparg elif self.is_pypy and opname in ('LOOKUP_METHOD', 'JUMP_IF_NOT_DEBUG', 'SETUP_EXCEPT', 'SETUP_FINALLY'): # The value in the dict is in special cases in semantic actions, such # as CALL_FUNCTION. The value is not used in these cases, so we put # in arbitrary value 0. customize[opname] = 0 elif op == self.opc.JUMP_ABSOLUTE: # Further classify JUMP_ABSOLUTE into backward jumps # which are used in loops, and "CONTINUE" jumps which # may appear in a "continue" statement. The loop-type # and continue-type jumps will help us classify loop # boundaries The continue-type jumps help us get # "continue" statements with would otherwise be turned # into a "pass" statement because JUMPs are sometimes # ignored in rules as just boundary overhead. In # comprehensions we might sometimes classify JUMP_BACK # as CONTINUE, but that's okay since we add a grammar # rule for that. target = self.get_target(offset) if target <= offset: if (offset in self.stmts and self.code[offset+3] not in (self.opc.END_FINALLY, self.opc.POP_BLOCK) and offset not in self.not_continue): opname = 'CONTINUE' else: opname = 'JUMP_BACK' elif op == self.opc.LOAD_GLOBAL: if offset in self.load_asserts: opname = 'LOAD_ASSERT' elif op == self.opc.RETURN_VALUE: if offset in self.return_end_ifs: opname = 'RETURN_END_IF' if offset in self.linestartoffsets: linestart = self.linestartoffsets[offset] else: linestart = None if offset not in replace: tokens.append(Token( opname, oparg, pattr, offset, linestart, op, has_arg, self.opc)) else: tokens.append(Token( replace[offset], oparg, pattr, offset, linestart, op, has_arg, self.opc)) pass pass if show_asm in ('both', 'after'): for t in tokens: print(t) print() return tokens, customize
def disassemble(self, co, classname=None, code_objects={}, show_asm=None): """ Pick out tokens from an uncompyle6 code object, and transform them, returning a list of uncompyle6 'Token's. The tranformations are made to assist the deparsing grammar. Specificially: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments Also, when we encounter certain tokens, we add them to a set which will cause custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific rules for the specific number of arguments they take. """ show_asm = self.show_asm if not show_asm else show_asm # show_asm = 'both' if show_asm in ('both', 'before'): bytecode = Bytecode(co, self.opc) for instr in bytecode.get_instructions(co): print(instr._disassemble()) # Container for tokens tokens = [] customize = {} if self.is_pypy: customize['PyPy'] = 1; self.code = array('B', co.co_code) self.build_lines_data(co) self.build_prev_op() bytecode = Bytecode(co, self.opc) # FIXME: put as its own method? # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() bs = list(bytecode) n = len(bs) for i in range(n): inst = bs[i] # We need to detect the difference between # "raise AssertionError" and "assert" # If we have a JUMP_FORWARD after the # RAISE_VARARGS then we have a "raise" statement # else we have an "assert" statement. if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n: next_inst = bs[i+1] if (next_inst.opname == 'LOAD_GLOBAL' and next_inst.argval == 'AssertionError'): for j in range(i+2, n): raise_inst = bs[j] if raise_inst.opname.startswith('RAISE_VARARGS'): if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD': self.load_asserts.add(next_inst.offset) pass break pass pass # Get jump targets # Format: {target offset: [jump offsets]} jump_targets = self.find_jump_targets() for inst in bytecode: argval = inst.argval if inst.offset in jump_targets: jump_idx = 0 for jump_offset in jump_targets[inst.offset]: tokens.append(Token('COME_FROM', None, repr(jump_offset), offset='%s_%s' % (inst.offset, jump_idx), has_arg = True, opc=self.opc)) jump_idx += 1 pass pass pattr = inst.argrepr opname = inst.opname op = inst.opcode if opname in ['LOAD_CONST']: const = inst.argval if iscode(const): if const.co_name == '<lambda>': opname = 'LOAD_LAMBDA' elif const.co_name == '<genexpr>': opname = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': opname = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': opname = 'LOAD_SETCOMP' elif const.co_name == '<listcomp>': opname = 'LOAD_LISTCOMP' # verify() uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' %\ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const pass elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'): pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval) if name_pair_args > 0: opname = '%s_N%d' % (opname, name_pair_args) pass if annotate_args > 0: opname = '%s_A_%d' % [opname, annotate_args] pass opname = '%s_%d' % (opname, pos_args) pattr = ("%d positional, %d keyword pair, %d annotated" % (pos_args, name_pair_args, annotate_args)) tokens.append( Token( type_ = opname, attr = (pos_args, name_pair_args, annotate_args), pattr = pattr, offset = inst.offset, linestart = inst.starts_line, op = op, has_arg = op_has_argument(op, op3), opc = self.opc ) ) continue elif op in self.varargs_ops: pos_args = inst.argval if self.is_pypy and not pos_args and opname == 'BUILD_MAP': opname = 'BUILD_MAP_n' else: opname = '%s_%d' % (opname, pos_args) elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'): # The value in the dict is in special cases in semantic actions, such # as CALL_FUNCTION. The value is not used in these cases, so we put # in arbitrary value 0. customize[opname] = 0 elif opname == 'UNPACK_EX': # FIXME: try with scanner and parser by # changing inst.argval before_args = inst.argval & 0xFF after_args = (inst.argval >> 8) & 0xff pattr = "%d before vararg, %d after" % (before_args, after_args) argval = (before_args, after_args) opname = '%s_%d+%d' % (opname, before_args, after_args) elif op == self.opc.JUMP_ABSOLUTE: # Further classify JUMP_ABSOLUTE into backward jumps # which are used in loops, and "CONTINUE" jumps which # may appear in a "continue" statement. The loop-type # and continue-type jumps will help us classify loop # boundaries The continue-type jumps help us get # "continue" statements with would otherwise be turned # into a "pass" statement because JUMPs are sometimes # ignored in rules as just boundary overhead. In # comprehensions we might sometimes classify JUMP_BACK # as CONTINUE, but that's okay since we add a grammar # rule for that. pattr = inst.argval target = self.get_target(inst.offset) if target <= inst.offset: next_opname = self.opname[self.code[inst.offset+3]] if (inst.offset in self.stmts and next_opname not in ('END_FINALLY', 'POP_BLOCK') and inst.offset not in self.not_continue): opname = 'CONTINUE' else: opname = 'JUMP_BACK' # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. # There are other situations were we don't catch # CONTINUE as well. if tokens[-1].type == 'JUMP_BACK': tokens[-1].type = intern('CONTINUE') elif op == self.opc.RETURN_VALUE: if inst.offset in self.return_end_ifs: opname = 'RETURN_END_IF' elif inst.offset in self.load_asserts: opname = 'LOAD_ASSERT' tokens.append( Token( type_ = opname, attr = argval, pattr = pattr, offset = inst.offset, linestart = inst.starts_line, op = op, has_arg = (op >= op3.HAVE_ARGUMENT), opc = self.opc ) ) pass if show_asm in ('both', 'after'): for t in tokens: print(t) print() return tokens, customize
def disassemble(self, co, classname=None, code_objects={}, show_asm=None): """ Disassemble a Python 2 code object, returning a list of 'Token'. Various tranformations are made to assist the deparsing grammar. For example: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional aruments The main part of this procedure is modelled after dis.disassemble(). """ show_asm = self.show_asm if not show_asm else show_asm # show_asm = 'before' if show_asm in ('both', 'before'): from xdis.bytecode import Bytecode bytecode = Bytecode(co, self.opc) for instr in bytecode.get_instructions(co): print(instr._disassemble()) # Container for tokens tokens = [] customize = {} Token = self.Token # shortcut n = self.setup_code(co) self.build_lines_data(co, n) self.build_prev_op(n) # self.lines contains (block,addrLastInstr) if classname: classname = '_' + classname.lstrip('_') + '__' def unmangle(name): if name.startswith(classname) and name[-2:] != '__': return name[len(classname) - 2:] return name free = [ unmangle(name) for name in (co.co_cellvars + co.co_freevars) ] names = [ unmangle(name) for name in co.co_names ] varnames = [ unmangle(name) for name in co.co_varnames ] else: free = co.co_cellvars + co.co_freevars names = co.co_names varnames = co.co_varnames self.names = names # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() for i in self.op_range(0, n): # We need to detect the difference between # "raise AssertionError" and # "assert" if self.code[i] == self.opc.PJIT and self.code[i+3] == self.opc.LOAD_GLOBAL: if names[self.get_argument(i+3)] == 'AssertionError': self.load_asserts.add(i+3) cf = self.find_jump_targets() # contains (code, [addrRefToCode]) last_stmt = self.next_stmt[0] i = self.next_stmt[last_stmt] replace = {} while i < n-1: if self.lines[last_stmt].next > i: if self.code[last_stmt] == self.opc.PRINT_ITEM: if self.code[i] == self.opc.PRINT_ITEM: replace[i] = 'PRINT_ITEM_CONT' elif self.code[i] == self.opc.PRINT_NEWLINE: replace[i] = 'PRINT_NEWLINE_CONT' last_stmt = i i = self.next_stmt[i] extended_arg = 0 for offset in self.op_range(0, n): if offset in cf: k = 0 for j in cf[offset]: tokens.append(Token( 'COME_FROM', None, repr(j), offset="%s_%d" % (offset, k), has_arg = True)) k += 1 op = self.code[offset] opname = self.opc.opname[op] oparg = None; pattr = None has_arg = (op >= self.opc.HAVE_ARGUMENT) if has_arg: oparg = self.get_argument(offset) + extended_arg extended_arg = 0 if op == self.opc.EXTENDED_ARG: extended_arg = oparg * scan.L65536 continue if op in self.opc.hasconst: const = co.co_consts[oparg] if iscode(const): oparg = const if const.co_name == '<lambda>': assert opname == 'LOAD_CONST' opname = 'LOAD_LAMBDA' elif const.co_name == '<genexpr>': opname = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': opname = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': opname = 'LOAD_SETCOMP' # verify() uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' %\ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const elif op in self.opc.hasname: pattr = names[oparg] elif op in self.opc.hasjrel: pattr = repr(offset + 3 + oparg) elif op in self.opc.hasjabs: pattr = repr(oparg) elif op in self.opc.haslocal: pattr = varnames[oparg] elif op in self.opc.hascompare: pattr = self.opc.cmp_op[oparg] elif op in self.opc.hasfree: pattr = free[oparg] if op in self.varargs_ops: # CE - Hack for >= 2.5 # Now all values loaded via LOAD_CLOSURE are packed into # a tuple before calling MAKE_CLOSURE. if op == self.opc.BUILD_TUPLE and \ self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE: continue else: opname = '%s_%d' % (opname, oparg) if op != self.opc.BUILD_SLICE: customize[opname] = oparg elif op == self.opc.JUMP_ABSOLUTE: target = self.get_target(offset) if target < offset: if (offset in self.stmts and self.code[offset+3] not in (self.opc.END_FINALLY, self.opc.POP_BLOCK) and offset not in self.not_continue): opname = 'CONTINUE' else: opname = 'JUMP_BACK' elif op == self.opc.LOAD_GLOBAL: if offset in self.load_asserts: opname = 'LOAD_ASSERT' elif op == self.opc.RETURN_VALUE: if offset in self.return_end_ifs: opname = 'RETURN_END_IF' if offset in self.linestartoffsets: linestart = self.linestartoffsets[offset] else: linestart = None if offset not in replace: tokens.append(Token( opname, oparg, pattr, offset, linestart, op, has_arg)) else: tokens.append(Token( replace[offset], oparg, pattr, offset, linestart, op, has_arg)) pass pass if show_asm in ('both', 'after'): for t in tokens: print(t.format()) print() return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None): """ Pick out tokens from an uncompyle6 code object, and transform them, returning a list of uncompyle6 'Token's. The transformations are made to assist the deparsing grammar. Specificially: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments Also, when we encounter certain tokens, we add them to a set which will cause custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific rules for the specific number of arguments they take. """ show_asm = self.show_asm if not show_asm else show_asm # show_asm = 'after' if show_asm in ('both', 'before'): from xdis.bytecode import Bytecode bytecode = Bytecode(co, self.opc) for instr in bytecode.get_instructions(co): print(instr._disassemble()) # Container for tokens tokens = [] customize = {} if self.is_pypy: customize['PyPy'] = 1 Token = self.Token # shortcut codelen = self.setup_code(co) self.build_lines_data(co, codelen) self.build_prev_op(codelen) free, names, varnames = self.unmangle_code_names(co, classname) self.names = names # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() for i in self.op_range(0, codelen): # We need to detect the difference between: # raise AssertionError # and # assert ... if (self.code[i] == self.opc.JUMP_IF_TRUE and i + 4 < codelen and self.code[i+3] == self.opc.POP_TOP and self.code[i+4] == self.opc.LOAD_GLOBAL): if names[self.get_argument(i+4)] == 'AssertionError': self.load_asserts.add(i+4) jump_targets = self.find_jump_targets(show_asm) # contains (code, [addrRefToCode]) last_stmt = self.next_stmt[0] i = self.next_stmt[last_stmt] replace = {} while i < codelen - 1: if self.lines[last_stmt].next > i: # Distinguish "print ..." from "print ...," if self.code[last_stmt] == self.opc.PRINT_ITEM: if self.code[i] == self.opc.PRINT_ITEM: replace[i] = 'PRINT_ITEM_CONT' elif self.code[i] == self.opc.PRINT_NEWLINE: replace[i] = 'PRINT_NEWLINE_CONT' last_stmt = i i = self.next_stmt[i] extended_arg = 0 for offset in self.op_range(0, codelen): op = self.code[offset] op_name = self.opname[op] oparg = None; pattr = None if offset in jump_targets: jump_idx = 0 # We want to process COME_FROMs to the same offset to be in *descending* # offset order so we have the larger range or biggest instruction interval # last. (I think they are sorted in increasing order, but for safety # we sort them). That way, specific COME_FROM tags will match up # properly. For example, a "loop" with an "if" nested in it should have the # "loop" tag last so the grammar rule matches that properly. last_jump_offset = -1 for jump_offset in sorted(jump_targets[offset], reverse=True): if jump_offset != last_jump_offset: tokens.append(Token( 'COME_FROM', None, repr(jump_offset), offset="%s_%d" % (offset, jump_idx), has_arg = True)) jump_idx += 1 last_jump_offset = jump_offset elif offset in self.thens: tokens.append(Token( 'THEN', None, self.thens[offset], offset="%s_0" % offset, has_arg = True)) has_arg = (op >= self.opc.HAVE_ARGUMENT) if has_arg: oparg = self.get_argument(offset) + extended_arg extended_arg = 0 if op == self.opc.EXTENDED_ARG: raise NotImplementedError extended_arg = oparg * scan.L65536 continue if op in self.opc.hasconst: const = co.co_consts[oparg] # We can't use inspect.iscode() because we may be # using a different version of Python than the # one that this was byte-compiled on. So the code # types may mismatch. if hasattr(const, 'co_name'): oparg = const if const.co_name == '<lambda>': assert op_name == 'LOAD_CONST' op_name = 'LOAD_LAMBDA' elif const.co_name == self.genexpr_name: op_name = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': op_name = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': op_name = 'LOAD_SETCOMP' # verify uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' % \ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const elif op in self.opc.hasname: pattr = names[oparg] elif op in self.opc.hasjrel: pattr = repr(offset + 3 + oparg) if op == self.opc.JUMP_FORWARD: target = self.get_target(offset) # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. if len(tokens) and tokens[-1].type == 'JUMP_BACK': tokens[-1].type = intern('CONTINUE') elif op in self.opc.hasjabs: pattr = repr(oparg) elif op in self.opc.haslocal: pattr = varnames[oparg] elif op in self.opc.hascompare: pattr = self.opc.cmp_op[oparg] elif op in self.opc.hasfree: pattr = free[oparg] if op in self.varargs_ops: # CE - Hack for >= 2.5 # Now all values loaded via LOAD_CLOSURE are packed into # a tuple before calling MAKE_CLOSURE. if (self.version >= 2.5 and op == self.opc.BUILD_TUPLE and self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE): continue else: op_name = '%s_%d' % (op_name, oparg) if op != self.opc.BUILD_SLICE: customize[op_name] = oparg elif op == self.opc.JUMP_ABSOLUTE: # Further classify JUMP_ABSOLUTE into backward jumps # which are used in loops, and "CONTINUE" jumps which # may appear in a "continue" statement. The loop-type # and continue-type jumps will help us classify loop # boundaries The continue-type jumps help us get # "continue" statements with would otherwise be turned # into a "pass" statement because JUMPs are sometimes # ignored in rules as just boundary overhead. In # comprehensions we might sometimes classify JUMP_BACK # as CONTINUE, but that's okay since we add a grammar # rule for that. target = self.get_target(offset) if target <= offset: op_name = 'JUMP_BACK' if (offset in self.stmts and self.code[offset+3] not in (self.opc.END_FINALLY, self.opc.POP_BLOCK)): if ((offset in self.linestartoffsets and tokens[-1].type == 'JUMP_BACK') or offset not in self.not_continue): op_name = 'CONTINUE' else: # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. if tokens[-1].type == 'JUMP_BACK': # We need 'intern' since we have # already have processed the previous # token. tokens[-1].type = intern('CONTINUE') elif op == self.opc.LOAD_GLOBAL: if offset in self.load_asserts: op_name = 'LOAD_ASSERT' elif op == self.opc.RETURN_VALUE: if offset in self.return_end_ifs: op_name = 'RETURN_END_IF' if offset in self.linestartoffsets: linestart = self.linestartoffsets[offset] else: linestart = None if offset not in replace: tokens.append(Token( op_name, oparg, pattr, offset, linestart, op, has_arg, self.opc)) else: tokens.append(Token( replace[offset], oparg, pattr, offset, linestart, op, has_arg, self.opc)) pass pass if show_asm in ('both', 'after'): for t in tokens: print(t.format(line_prefix='L.')) print() return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None): """ Pick out tokens from an uncompyle6 code object, and transform them, returning a list of uncompyle6 Token's. The transformations are made to assist the deparsing grammar. Specificially: - various types of LOAD_CONST's are categorized in terms of what they load - COME_FROM instructions are added to assist parsing control structures - MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments - some EXTENDED_ARGS instructions are removed Also, when we encounter certain tokens, we add them to a set which will cause custom grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST cause specific rules for the specific number of arguments they take. """ # FIXME: remove this when all subsidiary functions have been removed. # We should be able to get everything from the self.insts list. self.code = array('B', co.co_code) bytecode = Bytecode(co, self.opc) show_asm = self.show_asm if not show_asm else show_asm # show_asm = 'both' if show_asm in ('both', 'before'): for instr in bytecode.get_instructions(co): print(instr.disassemble()) # list of tokens/instructions tokens = [] # "customize" is a dict whose keys are nonterminals # and the value is the argument stack entries for that # nonterminal. The count is a little hoaky. It is mostly # not used, but sometimes it is. # "customize" is a dict whose keys are nonterminals customize = {} if self.is_pypy: customize['PyPy'] = 0 self.build_lines_data(co) self.build_prev_op() # FIXME: put as its own method? # Scan for assertions. Later we will # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'. # 'LOAD_ASSERT' is used in assert statements. self.load_asserts = set() self.insts = list(bytecode) n = len(self.insts) for i, inst in enumerate(self.insts): # We need to detect the difference between: # raise AssertionError # and # assert ... # If we have a JUMP_FORWARD after the # RAISE_VARARGS then we have a "raise" statement # else we have an "assert" statement. if inst.opname == 'POP_JUMP_IF_TRUE' and i + 1 < n: next_inst = self.insts[i + 1] if (next_inst.opname == 'LOAD_GLOBAL' and next_inst.argval == 'AssertionError'): if (i + 2 < n and self.insts[i + 2].opname.startswith('RAISE_VARARGS')): self.load_asserts.add(next_inst.offset) pass pass # Get jump targets # Format: {target offset: [jump offsets]} jump_targets = self.find_jump_targets(show_asm) # print("XXX2", jump_targets) last_op_was_break = False for i, inst in enumerate(bytecode): argval = inst.argval op = inst.opcode if op == self.opc.EXTENDED_ARG: # FIXME: The EXTENDED_ARG is used to signal annotation # parameters if self.insts[i + 1].opcode != self.opc.MAKE_FUNCTION: continue if inst.offset in jump_targets: jump_idx = 0 # We want to process COME_FROMs to the same offset to be in *descending* # offset order so we have the larger range or biggest instruction interval # last. (I think they are sorted in increasing order, but for safety # we sort them). That way, specific COME_FROM tags will match up # properly. For example, a "loop" with an "if" nested in it should have the # "loop" tag last so the grammar rule matches that properly. for jump_offset in sorted(jump_targets[inst.offset], reverse=True): come_from_name = 'COME_FROM' opname = self.opname_for_offset(jump_offset) if opname.startswith('SETUP_'): come_from_type = opname[len('SETUP_'):] come_from_name = 'COME_FROM_%s' % come_from_type pass elif inst.offset in self.except_targets: come_from_name = 'COME_FROM_EXCEPT_CLAUSE' tokens.append( Token(come_from_name, None, repr(jump_offset), offset='%s_%s' % (inst.offset, jump_idx), has_arg=True, opc=self.opc)) jump_idx += 1 pass pass elif inst.offset in self.else_start: end_offset = self.else_start[inst.offset] tokens.append( Token('ELSE', None, repr(end_offset), offset='%s' % (inst.offset), has_arg=True, opc=self.opc)) pass pattr = inst.argrepr opname = inst.opname if op in self.opc.CONST_OPS: const = argval if iscode(const): if const.co_name == '<lambda>': assert opname == 'LOAD_CONST' opname = 'LOAD_LAMBDA' elif const.co_name == '<genexpr>': opname = 'LOAD_GENEXPR' elif const.co_name == '<dictcomp>': opname = 'LOAD_DICTCOMP' elif const.co_name == '<setcomp>': opname = 'LOAD_SETCOMP' elif const.co_name == '<listcomp>': opname = 'LOAD_LISTCOMP' # verify() uses 'pattr' for comparison, since 'attr' # now holds Code(const) and thus can not be used # for comparison (todo: think about changing this) # pattr = 'code_object @ 0x%x %s->%s' %\ # (id(const), const.co_filename, const.co_name) pattr = '<code_object ' + const.co_name + '>' else: pattr = const pass elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'): if self.version >= 3.6: # 3.6+ doesn't have MAKE_CLOSURE, so opname == 'MAKE_FUNCTION' flags = argval opname = 'MAKE_FUNCTION_%d' % (flags) attr = [] for flag in self.MAKE_FUNCTION_FLAGS: bit = flags & 1 if bit: if pattr: pattr += ", " + flag else: pattr += flag attr.append(bit) flags >>= 1 attr = attr[:4] # remove last value: attr[5] == False else: pos_args, name_pair_args, annotate_args = parse_fn_counts( inst.argval) pattr = ("%d positional, %d keyword pair, %d annotated" % (pos_args, name_pair_args, annotate_args)) if name_pair_args > 0: opname = '%s_N%d' % (opname, name_pair_args) pass if annotate_args > 0: opname = '%s_A_%d' % (opname, annotate_args) pass opname = '%s_%d' % (opname, pos_args) attr = (pos_args, name_pair_args, annotate_args) tokens.append( Token(opname=opname, attr=attr, pattr=pattr, offset=inst.offset, linestart=inst.starts_line, op=op, has_arg=inst.has_arg, opc=self.opc)) continue elif op in self.varargs_ops: pos_args = argval if self.is_pypy and not pos_args and opname == 'BUILD_MAP': opname = 'BUILD_MAP_n' else: opname = '%s_%d' % (opname, pos_args) elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'): # The value in the dict is in special cases in semantic actions, such # as CALL_FUNCTION. The value is not used in these cases, so we put # in arbitrary value 0. customize[opname] = 0 elif opname == 'UNPACK_EX': # FIXME: try with scanner and parser by # changing argval before_args = argval & 0xFF after_args = (argval >> 8) & 0xff pattr = "%d before vararg, %d after" % (before_args, after_args) argval = (before_args, after_args) opname = '%s_%d+%d' % (opname, before_args, after_args) elif op == self.opc.JUMP_ABSOLUTE: # Further classify JUMP_ABSOLUTE into backward jumps # which are used in loops, and "CONTINUE" jumps which # may appear in a "continue" statement. The loop-type # and continue-type jumps will help us classify loop # boundaries The continue-type jumps help us get # "continue" statements with would otherwise be turned # into a "pass" statement because JUMPs are sometimes # ignored in rules as just boundary overhead. In # comprehensions we might sometimes classify JUMP_BACK # as CONTINUE, but that's okay since we add a grammar # rule for that. pattr = argval # FIXME: 0 isn't always correct target = self.get_target(inst.offset) if target <= inst.offset: next_opname = self.opname[self.code[inst.offset + 3]] if (inst.offset in self.stmts and (self.version != 3.0 or (hasattr(inst, 'linestart'))) and (next_opname not in ( 'END_FINALLY', 'POP_BLOCK', # Python 3.0 only uses POP_TOP 'POP_TOP'))): opname = 'CONTINUE' else: opname = 'JUMP_BACK' # FIXME: this is a hack to catch stuff like: # if x: continue # the "continue" is not on a new line. # There are other situations where we don't catch # CONTINUE as well. if tokens[-1].kind == 'JUMP_BACK' and tokens[ -1].attr <= argval: if tokens[-2].kind == 'BREAK_LOOP': del tokens[-1] else: # intern is used because we are changing the *previous* token tokens[-1].kind = intern('CONTINUE') if last_op_was_break and opname == 'CONTINUE': last_op_was_break = False continue # FIXME: go over for Python 3.6+. This is sometimes wrong elif op == self.opc.RETURN_VALUE: if inst.offset in self.return_end_ifs: opname = 'RETURN_END_IF' elif inst.offset in self.load_asserts: opname = 'LOAD_ASSERT' last_op_was_break = opname == 'BREAK_LOOP' tokens.append( Token(opname=opname, attr=argval, pattr=pattr, offset=inst.offset, linestart=inst.starts_line, op=op, has_arg=inst.has_arg, opc=self.opc)) pass if show_asm in ('both', 'after'): for t in tokens: print(t) print() return tokens, customize