def _addcompare(self, opname, opcode, arg, argrepr, target):
    """Append a comparison instruction.

    ``arg`` is passed through ``self._mapcompare``, which returns the
    ``(arg, argval)`` pair to store on the instruction.  When no ``argrepr``
    is supplied, the mapped ``argval`` is used as the display text as well.

    Returns whatever ``self._addinstr`` returns for the new instruction.
    """
    arg, argval = self._mapcompare(arg)
    if not argrepr:
        argrepr = argval
    # Store the mapped value in the argval slot; the display text belongs
    # only in argrepr, otherwise an explicit repr would overwrite the value.
    instr = dis.Instruction(opname, opcode, arg, argval, argrepr,
                            self._offset, self._line, target)
    return self._addinstr(instr)
def mk_extended_arg(arg, extended):
    """Build an ``EXTENDED_ARG`` instruction carrying ``arg``.

    ``arg`` is stored as both ``arg`` and ``argval``; ``argrepr`` is left as
    ``None``.  The offset, starting line and jump-target flag are copied from
    ``extended``.
    """
    fields = {
        'opname': 'EXTENDED_ARG',
        'opcode': dis.EXTENDED_ARG,
        'arg': arg,
        'argval': arg,
        'argrepr': None,
    }
    # Positional metadata comes straight from the instruction being extended.
    for attr in ('offset', 'starts_line', 'is_jump_target'):
        fields[attr] = getattr(extended, attr)
    return dis.Instruction(**fields)
def _addmisc(self, opname, opcode, arg, argrepr, target):
    """Append an instruction whose argument is a plain integer.

    ``arg`` is converted with ``int()`` and stored as both ``arg`` and
    ``argval``.  When ``argrepr`` is empty, the unconverted ``arg`` is used
    as the display text.

    Returns whatever ``self._addinstr`` returns for the new instruction.

    Raises:
        SyntaxError: if ``arg`` cannot be converted to an integer.
    """
    if not argrepr:
        argrepr = arg
    try:
        arg = int(arg)
    except (TypeError, ValueError) as err:
        # TypeError covers non-string input such as None, so every bad
        # argument surfaces as the same SyntaxError with its cause attached.
        raise SyntaxError("{} requires numeric arg, not {}".format(
            opname, arg)) from err
    instr = dis.Instruction(opname, opcode, arg, arg, argrepr,
                            self._offset, self._line, target)
    return self._addinstr(instr)
def _make_stable(gen):
    """Yield a rebuilt copy of every instruction produced by ``gen``.

    Each copy keeps the original fields except ``argrepr``, which is replaced
    by ``_stable_repr(argval)``.
    """
    for ins in gen:
        value = ins.argval
        yield _dis.Instruction(
            ins.opname,
            ins.opcode,
            ins.arg,
            value,
            _stable_repr(value),
            ins.offset,
            ins.starts_line,
            ins.is_jump_target,
        )
def _addnamed(self, opname, opcode, arg, argrepr, target, addnames):
    """Append an instruction whose argument indexes one of the name tables.

    The table is chosen by the opcode's membership in ``self._hasconst``,
    ``self._hasfree``, ``self._hasname`` or ``self._haslocal`` (checked in
    that order).  ``arg`` may be:

    * text that ``int()`` accepts: used as an index into the table; an index
      that is out of range yields the placeholder argval ``'#<index>'``;
    * ``'#<literal>'``: the literal is parsed with ``ast.literal_eval`` and
      looked up in the table;
    * any other string: looked up in the table as is.

    A looked-up value that is missing is appended to the table when
    ``addnames`` is true.  When ``argrepr`` is empty, ``argval`` is used as
    the display text.

    Returns whatever ``self._addinstr`` returns for the new instruction.

    Raises:
        ValueError: if the opcode belongs to none of the four tables.
        SyntaxError: if the value is missing and ``addnames`` is false.
    """
    if opcode in self._hasconst:
        namelist = self.constants
    elif opcode in self._hasfree:
        namelist = self.freevars
    elif opcode in self._hasname:
        namelist = self.names
    elif opcode in self._haslocal:
        namelist = self.varnames
    else:
        # Without this branch the lookup below fails on an unbound local.
        raise ValueError(
            "{} does not take a named argument".format(opname))

    try:
        index = int(arg)
    except ValueError:
        index = None

    if index is not None:
        try:
            argval = namelist[index]
        except LookupError:
            # TODO: Warn about this?
            argval = '#{}'.format(index)
    else:
        # startswith() keeps an empty string from raising IndexError.
        argval = ast.literal_eval(arg[1:]) if arg.startswith('#') else arg
        for index, known in enumerate(namelist):
            # Compare types as well as values: list.index() would treat
            # 1, True and 1.0 as the same entry and pick the wrong constant.
            if type(known) is type(argval) and known == argval:
                break
        else:
            if not addnames:
                raise SyntaxError("No such name '{}'".format(argval))
            index = len(namelist)
            namelist.append(argval)

    if not argrepr:
        argrepr = argval
    instr = dis.Instruction(opname, opcode, index, argval, argrepr,
                            self._offset, self._line, target)
    return self._addinstr(instr)
def _make_stable(
    gen: Iterable[_dis.Instruction],
) -> Generator[_dis.Instruction, None, None]:
    """Re-emit the instructions from ``gen`` with a recomputed ``argrepr``.

    Every yielded instruction copies the source instruction's fields, except
    that ``argrepr`` becomes ``_stable_repr(argval)``.
    """
    for source in gen:
        yield _dis.Instruction(
            opname=source.opname,
            opcode=source.opcode,
            arg=source.arg,
            argval=source.argval,
            argrepr=_stable_repr(source.argval),
            offset=source.offset,
            starts_line=source.starts_line,
            is_jump_target=source.is_jump_target,
        )
def new_instruction(instr, *, opname=None, opcode=None, arg=None,
                    argval=None, argrepr=None, offset=None,
                    starts_line=None, is_jump_target=None):
    """Return a copy of ``instr`` with the given fields replaced.

    Instructions are namedtuples and therefore immutable, so a changed
    instruction has to be built as a new object.  A keyword left as ``None``
    keeps the value from ``instr``; any other value, including falsy ones
    such as ``0``, ``''`` or ``False``, replaces it.
    """
    requested = {
        'opname': opname,
        'opcode': opcode,
        'arg': arg,
        'argval': argval,
        'argrepr': argrepr,
        'offset': offset,
        'starts_line': starts_line,
        'is_jump_target': is_jump_target,
    }
    # Test against None rather than truthiness so that arg=0, offset=0 and
    # is_jump_target=False are honoured.  _replace copies every field that
    # is not overridden.
    overrides = {name: value for name, value in requested.items()
                 if value is not None}
    return instr._replace(**overrides)
arg = code[i + 1] | extended_arg else: arg = None # in stock dis, this is done only in the HAVE_ARGUMENT branch # and that is wrong, since it is different from ceval.c logic extended_arg = (arg << 8) if op == dis.EXTENDED_ARG else 0 yield (i, op, arg) dis._unpack_opargs = _unpack_opargs ANY_ASCII = ord('S') empty_instr = dis.Instruction(opname=None, opcode=None, arg=None, argval=None, argrepr=None, offset=None, starts_line=None, is_jump_target=None) class CodeWrapper: def __init__(self, code, **attrs): self.__dict__.update(attrs) self.code = code def __getattr__(self, attr): return getattr(self.code, attr) class Transcoder:
def __init__(self, code):
    """Build ``self.basic_blocks`` for ``code``.

    The instructions of ``dis.Bytecode(code)`` are visited in order.  Each
    instruction judged reachable gets its own ``BasicBlock``, stored under
    the block's ``offset``.  Its successors, path metadata and block-stack
    view come from ``compute_jump_targets``, which is given the metadata and
    block-stack view joined from the blocks that already list this block as
    a successor.
    """
    self.__edge_num = {}
    bytecode = dis.Bytecode(code)
    # Seed the mapping with a synthetic FUNCTION_EXIT block under key -1.
    self.basic_blocks = {
        -1: BasicBlock(
            dis.Instruction('FUNCTION_EXIT', 0, 0, '', '', -1, 0, False))
    }
    self._blockstack = BlockStack()
    # Offsets known to be reachable; seeded with offset 0 and extended with
    # the successors of every block created below.
    reachable_instructions = {0}
    # Jump targets (argval) of jump instructions that were skipped as
    # unreachable.
    unreachable_jump_targets = set()

    def is_reachable(instr):
        # True when the offset was recorded as a successor.  A jump target
        # is also reachable unless it was only targeted by skipped jumps.
        # Anything else falls off the end and returns None (falsy).
        if instr.offset in reachable_instructions:
            return True
        elif instr.is_jump_target:
            if instr.offset in unreachable_jump_targets:
                return False
            return True

    def predecessors_of(current_bb):
        # Yield every block built so far that lists current_bb's offset
        # among its successors.
        for bb in self.basic_blocks.values():
            if current_bb.offset in bb.successors:
                yield bb

    def join_blockstack_views(current_bb):
        # Collect the last block of each reachable predecessor's view after
        # popping it down to current_bb's offset; the views must agree.
        blocks = set()
        for bb in predecessors_of(current_bb):
            if not is_reachable(bb.instruction):
                continue
            try:
                view = bb.blockstack_view.pop_until(current_bb.offset)
            except NotOnStackException:
                # Nothing to pop for this offset: keep the view unchanged.
                view = bb.blockstack_view
            blocks.add(view.last_block)
        assert len(blocks) <= 1, \
            (blocks, current_bb, list(predecessors_of(current_bb)))
        if blocks:
            last_block = blocks.pop()
        else:
            last_block = None
        return BlockStackView(self._blockstack, last_block)

    def join_path_metadata(current_bb):
        # Merge predecessor metadata: the 'has return' and 'has except'
        # flags are OR-ed, the 'broken loops' lists are concatenated.
        metadata = {}
        for bb in predecessors_of(current_bb):
            if bb.path_metadata.get('has return', False):
                metadata['has return'] = True
            if bb.path_metadata.get('has except', False):
                metadata['has except'] = True
            broken_loops = bb.path_metadata.get('broken loops', [])
            metadata.setdefault('broken loops', []).extend(broken_loops)
        return metadata

    for instr in bytecode:
        if not is_reachable(instr):
            # Remember where a skipped jump would have gone, so that its
            # target is not treated as reachable on that basis alone.
            if instr.opname in ops.jumps:
                unreachable_jump_targets.add(instr.argval)
            continue
        bb = BasicBlock(instr)
        # Register the block before joining, because the join helpers
        # iterate over self.basic_blocks.
        self.basic_blocks[bb.offset] = bb
        # TODO: maintain path metadata (stuff like whether there's a
        # RETURN_VALUE along the path) that gets inherited like the
        # blockstack view
        successors, new_metadata, blockstack_view = compute_jump_targets(
            instr,
            join_path_metadata(bb),
            join_blockstack_views(bb),
        )
        reachable_instructions.update(set(successors))
        bb.successors = successors
        bb.blockstack_view = blockstack_view
        bb.path_metadata = new_metadata
def __init__(self, myfn):
    """Build a control-flow graph from the disassembly of ``myfn``.

    Every instruction from ``dis.get_instructions(myfn)`` becomes a
    ``CFGNode`` stored in ``self.opcodes`` under ``index * 2``.  Nodes are
    chained in instruction order, starting from a synthetic ``NOP`` entry
    node.  Jump instructions are recorded in ``self.jump_to`` (target key ->
    list of jumping nodes) and linked to their targets in a second pass.

    Progress is printed to stdout while the graph is built.

    Raises:
        AssertionError: if an instruction's opname is not handled, or if a
            recorded jump target is not flagged as a jump target.
    """
    def lstadd(hmap, key, val):
        # Append val to the list under key, creating the list on first use.
        hmap.setdefault(key, []).append(val)

    # Opcodes that only need to be chained after the previous instruction.
    sequential = (
        'LOAD_CONST', 'LOAD_FAST', 'STORE_FAST', 'COMPARE_OP',
        'INPLACE_ADD', 'INPLACE_SUBTRACT', 'RETURN_VALUE', 'BINARY_MODULO',
        'POP_BLOCK',
    )

    enter = CFGNode(
        dis.Instruction('NOP', opcode=dis.opmap['NOP'], arg=0, argval=0,
                        argrepr=0, offset=0, starts_line=0,
                        is_jump_target=False), 0)
    last = enter
    self.jump_to = {}
    self.opcodes = {}
    for i, ins in enumerate(dis.get_instructions(myfn)):
        byte = i * 2
        node = CFGNode(ins, byte)
        self.opcodes[byte] = node
        print(i, ins)
        if ins.opname in sequential:
            pass
        elif ins.opname == 'POP_JUMP_IF_FALSE':
            print("will jump to", ins.arg)
            lstadd(self.jump_to, ins.arg, node)
            node.props['jmp'] = True
        elif ins.opname == 'JUMP_FORWARD':
            node.props['jmp'] = True
            # The target is computed relative to the following instruction.
            lstadd(self.jump_to, (i + 1) * 2 + ins.arg, node)
            print("will jump to", (i + 1) * 2 + ins.arg)
        elif ins.opname == 'SETUP_LOOP':
            print("setuploop: ", byte, ins.arg)
        elif ins.opname == 'JUMP_ABSOLUTE':
            print("will jump to", ins.arg)
            lstadd(self.jump_to, ins.arg, node)
            node.props['jmp'] = True
        else:
            # Raised explicitly instead of `assert False`: a bare assert is
            # stripped under -O, which would silently leave the instruction
            # out of the graph.
            raise AssertionError(
                'unsupported instruction: {}'.format(ins.opname))
        # Every supported instruction is chained after its predecessor.
        last.add_child(node)
        last = node

    # Second pass: connect each jumping node to the node at its target.
    for byte, node in self.opcodes.items():
        if byte in self.jump_to:
            assert node.i.is_jump_target
            for b in self.jump_to[byte]:
                b.add_child(node)
def execute(self, starting_stack=None, starting_env=None):
    """Decompile the loop body with a nested ``Decompiler``.

    After delegating to the parent ``execute``, the loop's instructions are
    run through a separate ``Decompiler``.  Edges that jump back to the
    first block are redirected to a placeholder expressed as a call to
    ``on_loop``, and the block holding the appended ``AFTER_LOOP`` marker is
    replaced by a placeholder expressed as a call to ``on_after``.

    Args:
        starting_stack: stack passed on to the parent ``execute``; a fresh
            empty list when omitted.
        starting_env: environment passed on to the parent ``execute``; a
            fresh empty dict when omitted.

    Sets ``self.loop_placeholder``, ``self.after_placeholder`` and
    ``self.decompiler``.
    """
    # Fresh containers per call: mutable defaults in the signature would be
    # shared by every call that omits the argument.
    if starting_stack is None:
        starting_stack = []
    if starting_env is None:
        starting_env = {}
    super().execute(starting_stack, starting_env)

    # We use a separate instance of the decompiler to process the code
    # inside the loop. We have to add a placeholder for the instruction
    # following the end of the loop.
    instructions = self.instructions + [
        dis.Instruction('AFTER_LOOP', -1, None, None, None,
                        self.instruction.argval, None, True)]

    # For some reason, breaks are translated into BREAK_LOOP instructions
    # instead of the standard JUMP_ABSOLUTE, so we must fix that manually.
    # `instructions` is a new list (built by + above), so replacing entries
    # does not touch self.instructions.
    for i, instr in enumerate(instructions):
        if instr.opname == 'BREAK_LOOP':
            # 113 is the numeric opcode paired with JUMP_ABSOLUTE here.
            instructions[i] = dis.Instruction(
                'JUMP_ABSOLUTE', 113, self.instruction.argval,
                self.instruction.argval, None, instr.offset,
                instr.starts_line, instr.is_jump_target)

    decompiler = Decompiler()
    decompiler.comprehension_mode = self.context.comprehension_mode
    decompiler.build_graph(instructions, True)
    start_block = decompiler.first_block
    last_block = decompiler.current_block
    decompiler.sort_blocks()
    decompiler.detach_unreachable()

    # We then identify the edges which jump back to the start_block, and
    # make them point to a placeholder block instead. This block, once
    # expressed, will turn into a call to on_loop.
    loop_placeholder = PlaceholderBlock(
        decompiler, Application(Identifier('on_loop'), Null()))
    decompiler.blocks.append(loop_placeholder)
    previous_predecessors = start_block.predecessors
    start_block.predecessors = []
    for predecessor, edge_type in previous_predecessors:
        if predecessor.index < start_block.index:
            # An edge from an earlier block is not a back edge: keep it.
            start_block.predecessors.append((predecessor, edge_type))
        elif edge_type == JUMP_FLOW:
            predecessor.next_jumped = loop_placeholder
            loop_placeholder.predecessors.append((predecessor, JUMP_FLOW))
        else:
            predecessor.next = loop_placeholder
            loop_placeholder.predecessors.append((predecessor, NORMAL_FLOW))

    # We also replace all the references to the last block, which only
    # contains the AFTER_LOOP instruction that we added earlier, with a
    # placeholder block which will turn into a call to on_after.
    after_placeholder = PlaceholderBlock(
        decompiler, Application(Identifier('on_after'), Null()))
    after_placeholder.index = last_block.index
    decompiler.blocks[last_block.index] = after_placeholder
    for predecessor, edge_type in last_block.predecessors:
        if edge_type == JUMP_FLOW:
            predecessor.next_jumped = after_placeholder
            after_placeholder.predecessors.append((predecessor, JUMP_FLOW))
        else:
            predecessor.next = after_placeholder
            after_placeholder.predecessors.append((predecessor, NORMAL_FLOW))

    # This is not pretty, but we must remove the edge that is created
    # between a block and the one which follows it.
    loop_placeholder.next = None
    after_placeholder.next = None

    self.loop_placeholder = loop_placeholder
    self.after_placeholder = after_placeholder
    self.decompiler = decompiler
def _addjump(self, opname, opcode, arg, argrepr, target):
    """Append a jump instruction exactly as given.

    Nothing is resolved here; the original note defers the hard part of
    jump handling to ``_fixup``.  ``argrepr`` fills both the ``argval`` and
    the ``argrepr`` slot of the instruction.

    Returns whatever ``self._addinstr`` returns for the new instruction.
    """
    fields = (opname, opcode, arg, argrepr, argrepr,
              self._offset, self._line, target)
    return self._addinstr(dis.Instruction(*fields))
def _addnoarg(self, opname, opcode, target):
    """Append an instruction that takes no argument.

    ``arg``, ``argval`` and ``argrepr`` are all ``None``; the offset and
    line come from ``self._offset`` and ``self._line``.

    Returns whatever ``self._addinstr`` returns for the new instruction.
    """
    return self._addinstr(
        dis.Instruction(
            opname=opname,
            opcode=opcode,
            arg=None,
            argval=None,
            argrepr=None,
            offset=self._offset,
            starts_line=self._line,
            is_jump_target=target,
        )
    )
def instrument(bytecode):
    """
    The primary method of instrumenting code, which involves injecting a bytecode counter between every
    instruction to be executed.

    Code objects found in ``co_consts`` are instrumented recursively, jump arguments are recomputed after
    the injection, and ``co_lnotab`` is rebuilt for the new instruction positions.

    :param bytecode: a code object, the bytecode submitted by the player
    :return: a new code object that has been injected with our bytecode counter
    :raises SyntaxError: if a jump instruction targets itself
    """
    # Ensure all code constants (e.g. list comprehensions) are also instrumented.
    new_consts = tuple(
        Instrument.instrument(constant) if isinstance(constant, CodeType) else constant
        for constant in bytecode.co_consts
    )

    instructions = list(dis.get_instructions(bytecode))
    # We will be inserting our __instrument__ name at the end of co_names.
    function_name_index = len(bytecode.co_names)

    # The injection: a call to an __instrument__ function followed by discarding its result.
    # These three instructions are inserted in front of the instrumented instructions.
    injection = [
        dis.Instruction(opcode=116, opname='LOAD_GLOBAL', arg=function_name_index % 256,
                        argval='__instrument__', argrepr='__instrument__',
                        offset=None, starts_line=None, is_jump_target=False),
        dis.Instruction(opcode=131, opname='CALL_FUNCTION', arg=0, argval=0, argrepr=0,
                        offset=None, starts_line=None, is_jump_target=False),
        dis.Instruction(opcode=1, opname='POP_TOP', arg=None, argval=None, argrepr=None,
                        offset=None, starts_line=None, is_jump_target=False),
    ]
    # Prefix extended-arg instructions (opcode 144) so that an index above 255 still fits:
    # each one carries the next higher byte of the index.
    while function_name_index > 255:  # 255 = 2^8 - 1 = one oparg byte
        function_name_index >>= 8
        injection = [
            dis.Instruction(
                opcode=144,
                opname='EXTENDED_ARGS',
                arg=function_name_index % 256,
                argval=function_name_index % 256,
                argrepr=function_name_index % 256,
                offset=None,
                starts_line=None,
                is_jump_target=False,
            )
        ] + injection

    # For maintenance we wrap each instruction so that it carries a jump_to field.
    instructions = [Instruction(instruction) for instruction in instructions]

    # Next, we cache on every jumper a reference to the instruction it jumps to.
    for instruction in instructions:
        # We're only looking for jumpers
        if not instruction.is_jumper():
            continue
        target = [t for t in instructions if instruction.argval == t.offset][0]
        instruction.jump_to = target
        # If any targets jump to themselves, that's not kosher.
        if instruction == target:
            raise SyntaxError('No self-referential loops.')

    unsafe = {110, 113, 114, 115, 116, 120, 124, 125, 131}  # bytecode ops that break the instrument

    # We then inject the injection before every instruction, except for those following an
    # extended-arg instruction or an unsafe opcode.
    cur_index = -1
    for (cur, last) in zip(instructions[:], [None] + instructions[:-1]):
        cur_index += 1
        if last is not None and last.opcode == 144:  # extended arg
            continue
        if last is not None and last.opcode in unsafe:
            continue
        for j, inject in enumerate(injection):
            injected_instruction = Instruction(inject)
            injected_instruction.was_there = False  # keeping track of the instructions added by us
            instructions.insert(cur_index + j, injected_instruction)
        # Skip past what was just inserted so that cur_index points at `cur` again.
        cur_index += len(injection)

    # Iterate through instructions. If it's a jumper, calculate the new correct offset. For each new
    # offset, if it is too large to fit in the current number of extended args, inject a new one
    # before it. Repeat until a full pass inserts nothing.
    fixed = False
    while not fixed:
        fixed = True
        i = 0
        for instruction in instructions[:]:
            instruction.offset = 2 * i
            if not instruction.is_jumper():
                i += 1
                continue
            correct_offset = instruction.calculate_offset(instructions)
            instruction.arg = correct_offset % 256
            correct_offset >>= 8
            extended_args = 0
            while correct_offset > 0:
                # Check if there is already an extended arg behind
                if i > extended_args and instructions[i - extended_args - 1].opcode == 144:
                    instructions[i - extended_args - 1].arg = correct_offset % 256
                # Otherwise, insert a new one
                else:
                    instructions.insert(i, Instruction.ExtendedArgs(correct_offset % 256))
                    instruction.extra_extended_args += 1
                    i += 1
                    fixed = False
                correct_offset >>= 8
                extended_args += 1
            i += 1

    # Maintaining correct line info (traceback bug fix).
    # co_lnotab is a byte string of (address increment, line increment) pairs. The address
    # increment is an unsigned byte; the line increment is a signed byte, so values of 0x80 and
    # above stand for negative deltas. Background reading:
    # https://towardsdatascience.com/understanding-python-bytecode-e7edaae8734d
    old_lnotab = {}  # maps old bytecode address -> accumulated line number
    line_num = 0
    instruction_num = 0
    lnotab = bytecode.co_lnotab
    for addr_incr, line_incr in zip(lnotab[0::2], lnotab[1::2]):
        if line_incr >= 0x80:
            line_incr -= 0x100  # decode the signed byte
        instruction_num += addr_incr
        line_num += line_incr
        old_lnotab[instruction_num] = line_num

    # Construct a map from old bytecode addresses to new ones.
    num_injected = 0
    instruction_index = 0
    old_to_new_instruction_num = {}
    for instruction in instructions:
        if instruction.was_there:
            old_to_new_instruction_num[2 * (instruction_index - num_injected)] = 2 * instruction_index
        instruction_index += 1
        if not instruction.was_there:
            num_injected += 1

    new_lnotab = {}
    for key in old_lnotab:
        new_lnotab[old_to_new_instruction_num[key]] = old_lnotab[key]

    # Re-encode as a list of deltas, splitting any delta that does not fit into one byte.
    pairs = sorted(new_lnotab.items())
    new_lnotab = []
    previous_pair = (0, 0)
    for pair in pairs:
        num_instructions = pair[0] - previous_pair[0]
        num_lines = pair[1] - previous_pair[1]
        while num_instructions > 127:
            new_lnotab.append(127)
            new_lnotab.append(0)
            num_instructions -= 127
        new_lnotab.append(num_instructions)
        # The bytes appended below pair up as (address, line): an oversized line delta
        # becomes (address, +-chunk), (0, +-chunk), ..., (0, remainder).
        while num_lines > 127:
            new_lnotab.append(127)
            new_lnotab.append(0)
            num_lines -= 127
        while num_lines < -128:
            new_lnotab.append(0x80)  # -128 as a signed byte
            new_lnotab.append(0)
            num_lines += 128
        new_lnotab.append(num_lines & 0xFF)  # two's-complement byte for negative deltas
        previous_pair = pair
    new_lnotab = bytes(new_lnotab)

    # Finally, we repackage up our instructions into a byte string and use it to build a new code object
    new_code = bytes(
        byte
        for inst in instructions
        for byte in (inst.opcode, 0 if inst.arg is None else inst.arg % 256)
    )

    # Make sure our code can locate the __instrument__ call
    new_names = tuple(bytecode.co_names) + ('__instrument__', )

    return Instrument.build_code(bytecode, new_code, new_names, new_consts, new_lnotab)