def unpack_opargs_wordcode(code, opc):
    """Yield (offset, opcode, argument) triples for 3.6+ wordcode.

    ``code`` may be raw bytecode or a code object (its ``co_code`` is
    used).  EXTENDED_ARG prefixes are folded into the following
    instruction's argument.  Opcodes without an argument yield ``None``.
    """
    ext_arg = 0
    try:
        size = len(code)
    except TypeError:
        # A code object was passed in rather than a byte sequence.
        code = code.co_code
        size = len(code)
    # Under a 2.x interpreter, 3.x bytecode indexes as 1-char strings;
    # the original checks only the first element, so we hoist it too.
    as_str = isinstance(code[0], str)
    for offset in range(0, size, 2):
        opcode = ord(code[offset]) if as_str else code[offset]
        if op_has_argument(opcode, opc):
            raw = ord(code[offset + 1]) if as_str else code[offset + 1]
            operand = raw | ext_arg
            # Accumulate EXTENDED_ARG into the next instruction's operand.
            ext_arg = (operand << 8) if opcode == opc.EXTENDED_ARG else 0
        else:
            operand = None
        yield (offset, opcode, operand)
def unpack_opargs_wordcode(code, opc):
    """Generate (offset, op, arg) tuples from 3.6+ wordcode.

    Accepts raw bytecode or a code object.  Handles the case where a
    Python 2.x interpreter sees 3.x bytecode as a string of characters.
    """
    accumulated = 0
    try:
        length = len(code)
    except TypeError:
        # Fall back to the code object's raw bytecode.
        code = code.co_code
        length = len(code)
    if isinstance(code[0], str):
        # Bytes show up as length-1 strings when a 2.x interpreter
        # handles 3.x bytecode.
        for pos in range(0, length, 2):
            opcode = ord(code[pos])
            if not op_has_argument(opcode, opc):
                yield (pos, opcode, None)
                continue
            operand = ord(code[pos + 1]) | accumulated
            accumulated = (operand << 8) if opcode == opc.EXTENDED_ARG else 0
            yield (pos, opcode, operand)
    else:
        for pos in range(0, length, 2):
            opcode = code[pos]
            if not op_has_argument(opcode, opc):
                yield (pos, opcode, None)
                continue
            operand = code[pos + 1] | accumulated
            accumulated = (operand << 8) if opcode == opc.EXTENDED_ARG else 0
            yield (pos, opcode, operand)
def build_prev_op(self, n):
    """Populate ``self.prev``: for every bytecode address (whether it
    holds an opcode byte or an operand byte) record the offset of the
    instruction that byte belongs to.
    """
    self.prev = [0]
    for offset in self.op_range(0, n):
        opcode = self.code[offset]
        # One entry for the opcode byte itself...
        self.prev.append(offset)
        # ...plus two more covering the operand bytes of a
        # with-argument (3-byte) instruction.
        if op_has_argument(opcode, self.opc):
            self.prev.extend((offset, offset))
def unpack_opargs_wordcode(code, opc):
    """Walk 3.6+ wordcode two bytes at a time, yielding (offset, op, arg).

    EXTENDED_ARG values are carried into the following instruction's
    argument; argument-less opcodes yield ``None`` for the argument.
    """
    carry = 0
    length = len(code)
    offset = 0
    while offset < length:
        op = code[offset]
        arg = None
        if op_has_argument(op, opc):
            operand = code[offset + 1]
            # Operand bytes are 1-char strings when scanning 3.x
            # bytecode under a Python 2 interpreter.
            if isinstance(operand, str):
                operand = ord(operand)
            arg = operand | carry
            carry = (arg << 8) if op == opc.EXTENDED_ARG else 0
        yield (offset, op, arg)
        offset += 2
def unpack_opargs_wordcode(code, opc):
    """Iterate over 2-byte (wordcode) instructions, yielding
    (offset, opcode, argument); argument is ``None`` for arg-less ops.
    """
    pending = 0
    for idx in range(0, len(code), 2):
        opcode = code[idx]
        if not op_has_argument(opcode, opc):
            yield (idx, opcode, None)
            continue
        raw = code[idx + 1]
        # ord() is needed when a 2.x interpreter indexes the bytecode
        # as a string rather than as integers.
        value = (ord(raw) if isinstance(raw, str) else raw) | pending
        # EXTENDED_ARG contributes high-order bits to the next operand.
        pending = (value << 8) if opcode == opc.EXTENDED_ARG else 0
        yield (idx, opcode, value)
def unpack_opargs_wordcode(code, opc):
    """Yield (offset, opcode, argument) for each wordcode instruction.

    ``code`` may be raw bytecode or a code object, in which case its
    ``co_code`` attribute is used.
    """
    carry = 0
    try:
        end = len(code)
    except TypeError:
        # Accept a code object as well as a byte sequence.
        code = code.co_code
        end = len(code)
    for pos in range(0, end, 2):
        opcode = code[pos]
        if op_has_argument(opcode, opc):
            byte = code[pos + 1]
            if isinstance(byte, str):
                # Running 3.x bytecode under a 2.x interpreter.
                byte = ord(byte)
            operand = byte | carry
            carry = (operand << 8) if opcode == opc.EXTENDED_ARG else 0
            yield (pos, opcode, operand)
        else:
            yield (pos, opcode, None)
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 'Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what
          they load
       -  COME_FROM instructions are added to assist parsing control
          structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of
          positional arguments

    Also, when we encounter certain tokens, we add them to a set which
    will cause custom grammar rules. Specifically, variable arg tokens
    like MAKE_FUNCTION or BUILD_LIST cause specific rules for the
    specific number of arguments they take.
    """
    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'both'
    if show_asm in ('both', 'before'):
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # list of tokens/instructions
    tokens = []

    # "customize" is a dict whose keys are nonterminals
    # and the value is the argument stack entries for that
    # nonterminal. The count is a little hoaky. It is mostly
    # not used, but sometimes it is.
    customize = {}
    if self.is_pypy:
        customize['PyPy'] = 0

    self.code = array('B', co.co_code)
    self.build_lines_data(co)
    self.build_prev_op()

    bytecode = Bytecode(co, self.opc)

    # FIXME: put as its own method?
    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    bs = list(bytecode)
    n = len(bs)
    for i in range(n):
        inst = bs[i]

        # We need to detect the difference between
        # "raise AssertionError" and "assert".
        # If we have a JUMP_FORWARD after the
        # RAISE_VARARGS then we have a "raise" statement
        # else we have an "assert" statement.
        if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
            next_inst = bs[i+1]
            if (next_inst.opname == 'LOAD_GLOBAL' and
                    next_inst.argval == 'AssertionError'):
                # Look ahead for the RAISE_VARARGS that the assert
                # machinery compiles to.
                for j in range(i+2, n):
                    raise_inst = bs[j]
                    if raise_inst.opname.startswith('RAISE_VARARGS'):
                        if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD':
                            self.load_asserts.add(next_inst.offset)
                        pass
                        break
                    pass
                pass

    # Get jump targets
    # Format: {target offset: [jump offsets]}
    jump_targets = self.find_jump_targets(show_asm)
    # print("XXX2", jump_targets)

    last_op_was_break = False
    extended_arg = 0
    for i, inst in enumerate(bytecode):

        argval = inst.argval
        op = inst.opcode
        has_arg = op_has_argument(op, self.opc)
        if has_arg:
            if op == self.opc.EXTENDED_ARG:
                extended_arg += self.extended_arg_val(argval)
                # Normally we remove EXTENDED_ARG from the
                # opcodes, but in the case of annotated functions
                # can use the EXTENDED_ARG tuple to signal we have
                # an annotated function.
                if not bs[i+1].opname.startswith("MAKE_FUNCTION"):
                    continue
            # NOTE(review): the nesting of this reset relative to the
            # EXTENDED_ARG accumulation was reconstructed from
            # collapsed source — confirm against upstream history.
            if isinstance(argval, int) and extended_arg:
                min_extended = self.extended_arg_val(1)
                if argval < min_extended:
                    argval += extended_arg
                    extended_arg = 0

        if inst.offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in
            # *descending* offset order so we have the larger range or
            # biggest instruction interval last. (I think they are
            # sorted in increasing order, but for safety we sort them).
            # That way, specific COME_FROM tags will match up properly.
            # For example, a "loop" with an "if" nested in it should
            # have the "loop" tag last so the grammar rule matches that
            # properly.
            for jump_offset in sorted(jump_targets[inst.offset],
                                      reverse=True):
                come_from_name = 'COME_FROM'
                opname = self.opname_for_offset(jump_offset)
                if opname.startswith('SETUP_'):
                    # e.g. SETUP_LOOP -> COME_FROM_LOOP
                    come_from_type = opname[len('SETUP_'):]
                    come_from_name = 'COME_FROM_%s' % come_from_type
                    pass
                elif inst.offset in self.except_targets:
                    come_from_name = 'COME_FROM_EXCEPT_CLAUSE'
                    if self.version <= 3.2:
                        continue
                    pass
                tokens.append(Token(come_from_name,
                                    None, repr(jump_offset),
                                    offset='%s_%s' % (inst.offset, jump_idx),
                                    has_arg=True, opc=self.opc))
                jump_idx += 1
                pass
            pass
        elif inst.offset in self.else_start:
            end_offset = self.else_start[inst.offset]
            tokens.append(Token('ELSE',
                                None, repr(end_offset),
                                offset='%s' % (inst.offset),
                                has_arg=True, opc=self.opc))
            pass

        pattr = inst.argrepr
        opname = inst.opname

        if opname in ['LOAD_CONST']:
            const = argval
            if iscode(const):
                # Classify code-object constants by the kind of
                # anonymous function they represent.
                if const.co_name == '<lambda>':
                    opname = 'LOAD_LAMBDA'
                elif const.co_name == '<genexpr>':
                    opname = 'LOAD_GENEXPR'
                elif const.co_name == '<dictcomp>':
                    opname = 'LOAD_DICTCOMP'
                elif const.co_name == '<setcomp>':
                    opname = 'LOAD_SETCOMP'
                elif const.co_name == '<listcomp>':
                    opname = 'LOAD_LISTCOMP'
                # verify() uses 'pattr' for comparison, since 'attr'
                # now holds Code(const) and thus can not be used
                # for comparison (todo: think about changing this)
                # pattr = 'code_object @ 0x%x %s->%s' %\
                #     (id(const), const.co_filename, const.co_name)
                pattr = '<code_object ' + const.co_name + '>'
            else:
                pattr = const
                pass
        elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
            if self.version >= 3.6:
                # 3.6+ doesn't have MAKE_CLOSURE, so
                # opname == 'MAKE_FUNCTION'
                flags = argval
                opname = 'MAKE_FUNCTION_%d' % (flags)
                attr = []
                # Decode the MAKE_FUNCTION flag bits into a readable
                # pattr and a bit-list attr.
                for flag in self.MAKE_FUNCTION_FLAGS:
                    bit = flags & 1
                    if bit:
                        if pattr:
                            pattr += ", " + flag
                        else:
                            pattr += flag
                    attr.append(bit)
                    flags >>= 1
                attr = attr[:4]  # remove last value: attr[5] == False
            else:
                pos_args, name_pair_args, annotate_args = parse_fn_counts(
                    inst.argval)
                pattr = ("%d positional, %d keyword pair, %d annotated" %
                         (pos_args, name_pair_args, annotate_args))
                if name_pair_args > 0:
                    opname = '%s_N%d' % (opname, name_pair_args)
                    pass
                if annotate_args > 0:
                    opname = '%s_A_%d' % (opname, annotate_args)
                    pass
                opname = '%s_%d' % (opname, pos_args)
                attr = (pos_args, name_pair_args, annotate_args)
            tokens.append(
                Token(
                    opname=opname,
                    attr=attr,
                    pattr=pattr,
                    offset=inst.offset,
                    linestart=inst.starts_line,
                    op=op,
                    has_arg=op_has_argument(op, op3),
                    opc=self.opc
                )
            )
            continue
        elif op in self.varargs_ops:
            pos_args = argval
            if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
                opname = 'BUILD_MAP_n'
            else:
                opname = '%s_%d' % (opname, pos_args)
        elif self.is_pypy and opname in ('CALL_METHOD',
                                         'JUMP_IF_NOT_DEBUG'):
            # The value in the dict is in special cases in semantic
            # actions, such as CALL_FUNCTION. The value is not used in
            # these cases, so we put in arbitrary value 0.
            customize[opname] = 0
        elif opname == 'UNPACK_EX':
            # FIXME: try with scanner and parser by
            # changing argval
            before_args = argval & 0xFF
            after_args = (argval >> 8) & 0xff
            pattr = "%d before vararg, %d after" % (before_args,
                                                    after_args)
            argval = (before_args, after_args)
            opname = '%s_%d+%d' % (opname, before_args, after_args)
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement. The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries. The continue-type jumps help us get
            # "continue" statements which would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            pattr = argval
            # FIXME: 0 isn't always correct
            target = self.get_target(inst.offset, 0)
            if target <= inst.offset:
                next_opname = self.opname[self.code[inst.offset+3]]
                if (inst.offset in self.stmts
                        and (self.version != 3.0
                             or (hasattr(inst, 'linestart')))
                        and (next_opname not in ('END_FINALLY',
                                                 'POP_BLOCK',
                                                 # Python 3.0 only uses POP_TOP
                                                 'POP_TOP'))):
                    opname = 'CONTINUE'
                else:
                    opname = 'JUMP_BACK'
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    # There are other situations where we don't catch
                    # CONTINUE as well.
                    if (tokens[-1].kind == 'JUMP_BACK'
                            and tokens[-1].attr <= argval):
                        if tokens[-2].kind == 'BREAK_LOOP':
                            del tokens[-1]
                        else:
                            # intern is used because we are changing
                            # the *previous* token
                            tokens[-1].kind = intern('CONTINUE')
            if last_op_was_break and opname == 'CONTINUE':
                last_op_was_break = False
                continue
        elif op == self.opc.RETURN_VALUE:
            if inst.offset in self.return_end_ifs:
                opname = 'RETURN_END_IF'
        elif inst.offset in self.load_asserts:
            opname = 'LOAD_ASSERT'

        last_op_was_break = opname == 'BREAK_LOOP'
        tokens.append(
            Token(
                opname=opname,
                attr=argval,
                pattr=pattr,
                offset=inst.offset,
                linestart=inst.starts_line,
                op=op,
                has_arg=(op >= op3.HAVE_ARGUMENT),
                opc=self.opc
            )
        )
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t)
        print()
    return tokens, customize
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 'Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what
          they load
       -  COME_FROM instructions are added to assist parsing control
          structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of
          positional arguments

    Also, when we encounter certain tokens, we add them to a set which
    will cause custom grammar rules. Specifically, variable arg tokens
    like MAKE_FUNCTION or BUILD_LIST cause specific rules for the
    specific number of arguments they take.
    """
    show_asm = self.show_asm if not show_asm else show_asm
    # show_asm = 'after'
    if show_asm in ('both', 'before'):
        from xdis.bytecode import Bytecode
        bytecode = Bytecode(co, self.opc)
        for instr in bytecode.get_instructions(co):
            print(instr._disassemble())

    # list of tokens/instructions
    tokens = []

    # "customize" is a dict whose keys are nonterminals
    # and the value is the argument stack entries for that
    # nonterminal. The count is a little hoaky. It is mostly
    # not used, but sometimes it is.
    customize = {}
    if self.is_pypy:
        customize['PyPy'] = 0

    Token = self.Token  # shortcut

    codelen = self.setup_code(co)

    self.build_lines_data(co, codelen)
    self.build_prev_op(codelen)

    free, names, varnames = self.unmangle_code_names(co, classname)
    self.names = names

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()
    for i in self.op_range(0, codelen):
        # We need to detect the difference between:
        #   raise AssertionError
        # and
        #   assert ...
        # Below we use the heuristic that it is preceded by a POP_JUMP.
        # However we could also use followed by RAISE_VARARGS
        # or for PyPy there may be a JUMP_IF_NOT_DEBUG before.
        # FIXME: remove uses of PJIF, and PJIT
        if self.is_pypy:
            have_pop_jump = self.code[i] in (self.opc.PJIF,
                                             self.opc.PJIT)
        else:
            have_pop_jump = self.code[i] == self.opc.PJIT

        if have_pop_jump and self.code[i + 3] == self.opc.LOAD_GLOBAL:
            if names[self.get_argument(i + 3)] == 'AssertionError':
                self.load_asserts.add(i + 3)

    jump_targets = self.find_jump_targets(show_asm)
    # contains (code, [addrRefToCode])

    # Build "replace": offsets whose opcode name should be rewritten,
    # distinguishing "print ..." from "print ...," continuations.
    last_stmt = self.next_stmt[0]
    i = self.next_stmt[last_stmt]
    replace = {}
    while i < codelen - 1:
        if self.lines[last_stmt].next > i:
            # Distinguish "print ..." from "print ...,"
            if self.code[last_stmt] == self.opc.PRINT_ITEM:
                if self.code[i] == self.opc.PRINT_ITEM:
                    replace[i] = 'PRINT_ITEM_CONT'
                elif self.code[i] == self.opc.PRINT_NEWLINE:
                    replace[i] = 'PRINT_NEWLINE_CONT'
        last_stmt = i
        i = self.next_stmt[i]

    extended_arg = 0
    for offset in self.op_range(0, codelen):
        if offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in
            # *descending* offset order so we have the larger range or
            # biggest instruction interval last. (I think they are
            # sorted in increasing order, but for safety we sort them).
            # That way, specific COME_FROM tags will match up properly.
            # For example, a "loop" with an "if" nested in it should
            # have the "loop" tag last so the grammar rule matches that
            # properly.
            for jump_offset in sorted(jump_targets[offset],
                                      reverse=True):
                # if jump_offset == last_offset:
                #     continue
                # last_offset = jump_offset
                come_from_name = 'COME_FROM'
                op_name = self.opname_for_offset(jump_offset)
                if op_name.startswith('SETUP_') and self.version == 2.7:
                    come_from_type = op_name[len('SETUP_'):]
                    if come_from_type not in ('LOOP', 'EXCEPT'):
                        come_from_name = 'COME_FROM_%s' % come_from_type
                    pass
                tokens.append(Token(come_from_name,
                                    None, repr(jump_offset),
                                    offset="%s_%d" % (offset, jump_idx),
                                    has_arg=True))
                jump_idx += 1
                pass

        op = self.code[offset]
        op_name = self.op_name(op)

        oparg = None
        pattr = None
        has_arg = op_has_argument(op, self.opc)
        if has_arg:
            oparg = self.get_argument(offset) + extended_arg
            extended_arg = 0
            if op == self.opc.EXTENDED_ARG:
                # Fold into the next instruction; don't emit a token.
                extended_arg = oparg * L65536
                continue
            if op in self.opc.CONST_OPS:
                const = co.co_consts[oparg]
                if iscode(const):
                    oparg = const
                    # Classify code-object constants by the kind of
                    # anonymous function they represent.
                    if const.co_name == '<lambda>':
                        assert op_name == 'LOAD_CONST'
                        op_name = 'LOAD_LAMBDA'
                    elif const.co_name == '<genexpr>':
                        op_name = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        op_name = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        op_name = 'LOAD_SETCOMP'
                    # verify() uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' %\
                    #     (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
            elif op in self.opc.NAME_OPS:
                pattr = names[oparg]
            elif op in self.opc.JREL_OPS:
                # use instead: hasattr(self, 'patch_continue'): ?
                if self.version == 2.7:
                    self.patch_continue(tokens, offset, op)
                pattr = repr(offset + 3 + oparg)
            elif op in self.opc.JABS_OPS:
                # use instead: hasattr(self, 'patch_continue'): ?
                if self.version == 2.7:
                    self.patch_continue(tokens, offset, op)
                pattr = repr(oparg)
            elif op in self.opc.LOCAL_OPS:
                pattr = varnames[oparg]
            elif op in self.opc.COMPARE_OPS:
                pattr = self.opc.cmp_op[oparg]
            elif op in self.opc.FREE_OPS:
                pattr = free[oparg]

        if op in self.varargs_ops:
            # CE - Hack for >= 2.5
            # Now all values loaded via LOAD_CLOSURE are packed into
            # a tuple before calling MAKE_CLOSURE.
            if op == self.opc.BUILD_TUPLE and \
               self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE:
                continue
            else:
                if self.is_pypy and not oparg and op_name == 'BUILD_MAP':
                    op_name = 'BUILD_MAP_n'
                else:
                    op_name = '%s_%d' % (op_name, oparg)
                if op != self.opc.BUILD_SLICE:
                    customize[op_name] = oparg
        elif self.is_pypy and op_name in ('LOOKUP_METHOD',
                                          'JUMP_IF_NOT_DEBUG',
                                          'SETUP_EXCEPT',
                                          'SETUP_FINALLY'):
            # The value in the dict is in special cases in semantic
            # actions, such as CALL_FUNCTION. The value is not used in
            # these cases, so we put in arbitrary value 0.
            customize[op_name] = 0
        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement. The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries. The continue-type jumps help us get
            # "continue" statements which would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            target = self.get_target(offset)
            if target <= offset:
                op_name = 'JUMP_BACK'
                if (offset in self.stmts
                        and self.code[offset + 3] not in
                        (self.opc.END_FINALLY, self.opc.POP_BLOCK)):
                    if ((offset in self.linestartoffsets and
                         self.code[self.prev[offset]]
                         == self.opc.JUMP_ABSOLUTE)
                            or offset not in self.not_continue):
                        op_name = 'CONTINUE'
        elif op == self.opc.LOAD_GLOBAL:
            if offset in self.load_asserts:
                op_name = 'LOAD_ASSERT'
        elif op == self.opc.RETURN_VALUE:
            if offset in self.return_end_ifs:
                op_name = 'RETURN_END_IF'

        if offset in self.linestartoffsets:
            linestart = self.linestartoffsets[offset]
        else:
            linestart = None

        if offset not in replace:
            tokens.append(Token(
                op_name, oparg, pattr, offset, linestart, op,
                has_arg, self.opc))
        else:
            tokens.append(Token(
                replace[offset], oparg, pattr, offset, linestart, op,
                has_arg, self.opc))
            pass
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t.format(line_prefix='L.'))
        print()
    return tokens, customize
def find_jump_targets(self, debug):
    """
    Detect all offsets in a byte code which are jump targets
    where we might insert a pseudo "COME_FROM" instruction.
    "COME_FROM" instructions are used in detecting overall
    control flow. The more detailed information about the
    control flow is captured in self.structs.
    Since this stuff is tricky, consult self.structs when
    something goes amiss.

    Return the list of offsets. An instruction can be jumped
    to in from multiple instructions.
    """
    code = self.code
    n = len(code)
    self.structs = [{'type': 'root',
                     'start': 0,
                     'end': n - 1}]

    # All loop entry points
    self.loops = []

    # Map fixed jumps to their real destination
    self.fixed_jumps = {}
    self.ignore_if = set()
    self.build_statement_indices()

    # Containers filled by detect_control_flow()
    self.not_continue = set()
    self.return_end_ifs = set()
    self.setup_loop_targets = {}  # target given setup_loop offset
    self.setup_loops = {}  # setup_loop offset given target
    self.thens = {}  # JUMP_IF's that separate the 'then' part of an 'if'

    targets = {}
    for offset in self.op_range(0, n):
        op = code[offset]

        # Determine structures and fix jumps in Python versions
        # since 2.3
        self.detect_control_flow(offset, op)

        if op_has_argument(op, self.opc):
            # fixed_jumps (filled by detect_control_flow) overrides
            # the literal jump destination when present.
            label = self.fixed_jumps.get(offset)
            oparg = self.get_argument(offset)

            if label is None:
                if op in self.opc.JREL_OPS and self.op_name(
                        op) != 'FOR_ITER':
                    # if (op in self.opc.JREL_OPS and
                    #     (self.version < 2.0 or op != self.opc.FOR_ITER)):
                    label = offset + 3 + oparg
                elif self.version == 2.7 and op in self.opc.JABS_OPS:
                    if op in (self.opc.JUMP_IF_FALSE_OR_POP,
                              self.opc.JUMP_IF_TRUE_OR_POP):
                        # Only forward jumps get a COME_FROM here.
                        if (oparg > offset):
                            label = oparg
                        pass
                    pass

            # FIXME: All the < 2.7 conditions are is horrible.
            # We need a better way.
            if label is not None and label != -1:
                # In Python < 2.7, the POP_TOP in:
                #   RETURN_VALUE, POP_TOP
                # does now start a new statement.
                # Otherwise, we have want to add a "COME_FROM"
                if not (self.version < 2.7 and
                        code[label] == self.opc.POP_TOP and
                        code[self.prev[label]] == self.opc.RETURN_VALUE):
                    # In Python < 2.7, don't add a COME_FROM, for:
                    #   JUMP_FORWARD, END_FINALLY
                    # or:
                    #   JUMP_FORWARD, POP_TOP, END_FINALLY
                    if not (self.version < 2.7
                            and op == self.opc.JUMP_FORWARD
                            and ((code[offset + 3]
                                  == self.opc.END_FINALLY)
                                 or (code[offset + 3] == self.opc.POP_TOP
                                     and code[offset + 4]
                                     == self.opc.END_FINALLY))):
                        # FIXME: rocky: I think we need something
                        # like this...
                        if offset not in set(
                                self.ignore_if) or self.version == 2.7:
                            # Record the jump source; for a loop target
                            # use the SETUP_LOOP offset instead.
                            source = (self.setup_loops[label]
                                      if label in self.setup_loops
                                      else offset)
                            targets[label] = targets.get(label,
                                                         []) + [source]
                            pass
                        pass
                    pass
        elif (op == self.opc.END_FINALLY
              and offset in self.fixed_jumps and self.version == 2.7):
            label = self.fixed_jumps[offset]
            targets[label] = targets.get(label, []) + [offset]
            pass
        pass

    # DEBUG:
    if debug in ('both', 'after'):
        print(targets)
        import pprint as pp
        pp.pprint(self.structs)

    return targets