def reduce_is_invalid(self, rule, ast, tokens, first, last):
    """Return a truthy value when reducing by ``rule`` must be rejected.

    The superclass check runs first and any truthy verdict from it is
    returned unchanged.  Otherwise a handful of rules get an extra
    sanity check, mostly that a conditional jump inside the candidate
    subtree lands where the construct would require it to.

    rule   -- ``(lhs, rhs_tuple)`` grammar rule being reduced
    ast    -- parse subtree built for the rule
    tokens -- the token list being parsed
    first  -- index in ``tokens`` of the first token covered by the rule
    last   -- index in ``tokens`` just used as the "end" token in the checks

    Rules without a dedicated check are accepted (``False``).
    """

    def jump_misses_end(jump, end_token):
        # The landing spot is computed as offset + operand + 3
        # (NOTE(review): 3 is presumably the size of the jump
        # instruction -- confirm).  The jump is fine when it lands on the
        # end token, or when both share the same printable operand.
        landing = jump.offset + jump.attr + 3
        if landing == end_token.offset:
            return False
        return not (end_token.pattr == jump.pattr)

    inherited = super(Python27Parser, self).reduce_is_invalid(
        rule, ast, tokens, first, last)
    if inherited:
        return inherited

    if rule == ('and', ('expr', 'jmp_false', 'expr', '\\e_come_from_opt')):
        # An "and" that is immediately followed by YIELD_VALUE is more
        # likely the "if" part of a comprehension.  (Checking that
        # last+1 is POP_TOP might be worth adding too.)
        if tokens[last] == 'YIELD_VALUE':
            return True
        # jmp_false has to reach the end of the "and", or go to the same
        # place the end of the "and" goes.
        return jump_misses_end(ast[1][0], tokens[last])

    lhs = rule[0]

    if lhs == 'raise_stmt1':
        raised = ast[0]
        return raised == 'expr' and raised[0] == 'or'

    if lhs in ('assert', 'assert2'):
        assert_target = ast[1][0].attr
        if last >= len(tokens):
            return False
        if assert_target == tokens[last].offset:
            return False
        tail = ast[-1]
        return not (assert_target == next_offset(tail.op, tail.opc,
                                                 tail.offset))

    if rule in (('list_if_not', ('expr', 'jmp_true', 'list_iter')),
                ('list_if', ('expr', 'jmp_false', 'list_iter'))):
        # Reject when the jump lands strictly between the jump itself and
        # the end token.
        cond_jump = ast[1][0]
        return cond_jump.offset < cond_jump.attr < tokens[last].offset

    if rule == ('or', ('expr', 'jmp_true', 'expr', '\\e_come_from_opt')):
        # jmp_true must not land in the middle of the "or".
        return jump_misses_end(ast[1][0], tokens[last])

    if (lhs == 'whilestmt'
            and rule[1][0:-2] == ('SETUP_LOOP', 'testexpr', 'l_stmts_opt',
                                  'JUMP_BACK', 'JUMP_BACK')):
        # The two adjacent JUMP_BACKs have to go to the same place.
        idx = last - 1
        while tokens[idx] != 'JUMP_BACK':
            idx -= 1
        return tokens[idx].attr != tokens[idx - 1].attr

    if lhs == 'if_expr_true':
        return first > 0 and tokens[first - 1] == 'POP_JUMP_IF_FALSE'

    return False
def reduce_is_invalid(self, rule, ast, tokens, first, last):
    """Return a truthy value when reducing by ``rule`` must be rejected.

    The superclass verdict is consulted first and returned as-is when
    truthy.  After that, specific rules are checked, mostly to confirm
    that a conditional jump in the subtree lands where the construct
    needs it to.

    rule   -- ``(lhs, rhs_tuple)`` grammar rule being reduced
    ast    -- parse subtree built for the rule
    tokens -- the token list being parsed
    first  -- index of the first token covered (not used by these checks)
    last   -- index in ``tokens`` used as the "end" token in the checks

    Rules without a dedicated check are accepted (``False``).
    """

    def jump_misses_end(jump, end_token):
        # Landing spot is offset + operand + 3 (NOTE(review): 3 is
        # presumably the jump instruction's size -- confirm).  Accept
        # when the jump lands on the end token or both carry the same
        # printable operand.
        landing = jump.offset + jump.attr + 3
        if landing == end_token.offset:
            return False
        return not (end_token.pattr == jump.pattr)

    inherited = super(Python27Parser, self).reduce_is_invalid(
        rule, ast, tokens, first, last)
    if inherited:
        return inherited

    if rule == ('and', ('expr', 'jmp_false', 'expr', '\\e_come_from_opt')):
        # jmp_false has to reach the end of the "and", or go to the same
        # place the end of the "and" goes.
        return jump_misses_end(ast[1][0], tokens[last])

    lhs = rule[0]

    if lhs == 'raise_stmt1':
        raised = ast[0]
        return raised == 'expr' and raised[0] == 'or'

    if lhs in ('assert', 'assert2'):
        assert_target = ast[1][0].attr
        if last >= len(tokens):
            return False
        if assert_target == tokens[last].offset:
            return False
        tail = ast[-1]
        return not (assert_target == next_offset(tail.op, tail.opc,
                                                 tail.offset))

    if rule in (('list_if_not', ('expr', 'jmp_true', 'list_iter')),
                ('list_if', ('expr', 'jmp_false', 'list_iter'))):
        # Reject when the jump lands strictly between the jump itself and
        # the end token.
        cond_jump = ast[1][0]
        return cond_jump.offset < cond_jump.attr < tokens[last].offset

    if rule == ('or', ('expr', 'jmp_true', 'expr', '\\e_come_from_opt')):
        # jmp_true must not land in the middle of the "or".
        return jump_misses_end(ast[1][0], tokens[last])

    if (lhs == 'whilestmt'
            and rule[1][0:-2] == ('SETUP_LOOP', 'testexpr', 'l_stmts_opt',
                                  'JUMP_BACK', 'JUMP_BACK')):
        # The two adjacent JUMP_BACKs have to go to the same place.
        idx = last - 1
        while tokens[idx] != 'JUMP_BACK':
            idx -= 1
        return tokens[idx].attr != tokens[idx - 1].attr

    # NOTE: a check for rule[0] == 'conditional_true' was left disabled at
    # this point.  It was marked as a hack (inspect the expr for nodes that
    # could have been a Boolean; dead code should be looked for as well),
    # and as written it compared ast[0] against both 'expr' and 'or'.
    return False
def get_target(self, offset, extended_arg=0):
    """
    Return the offset control can transfer to from the instruction at
    <offset>.

    For an opcode listed in the relative or absolute jump sets of
    ``self.opc`` this is the instruction's ``argval``; for anything else
    it is the fall-through offset of the following instruction.

    NOTE: extended_arg is accepted for compatibility but is no longer used.
    """
    inst = self.get_inst(offset)
    jump_ops = self.opc.JREL_OPS | self.opc.JABS_OPS
    if inst.opcode not in jump_ops:
        # Not a jump: control simply falls through to the next instruction.
        return next_offset(inst.opcode, self.opc, inst.offset)
    return inst.argval
def detect_control_flow(self, offset, targets, inst_index):
    """
    Detect structures and their boundaries to fix optimized jumps
    Python 3.0 is more like Python 2.6 than it is Python 3.x.
    So we have a special routine here.

    offset     -- bytecode offset of the instruction being examined
    targets    -- mapping consulted as ``targets[next_op]`` to find the
                  offsets that jump to a given offset (only read here)
    inst_index -- index into ``self.insts`` of the same instruction; used
                  to fetch its opcode

    Nothing is returned.  Depending on the opcode, the method records its
    findings in ``self.structs``, ``self.fixed_jumps``, ``self.loops``,
    ``self.setup_loops``, ``self.ignore_if``, ``self.not_continue`` and
    ``self.return_end_ifs``.
    """

    code = self.code
    op = self.insts[inst_index].opcode

    # Detect parent structure
    parent = self.structs[0]
    start = parent["start"]
    end = parent["end"]

    # Pick inner-most parent for our offset
    for struct in self.structs:
        current_start = struct["start"]
        current_end = struct["end"]
        if (current_start <= offset < current_end) and (
                current_start >= start and current_end <= end):
            start = current_start
            end = current_end
            parent = struct

    if op == self.opc.SETUP_LOOP:
        # We categorize loop types: 'for', 'while', 'while 1' with
        # possibly suffixes '-loop' and '-else'
        # Try to find the jump_back instruction of the loop.
        # It could be a return instruction.

        start += instruction_size(op, self.opc)
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.setup_loops[target] = offset

        # The loop's end was clipped to the parent: remember the fixed end.
        if target != end:
            self.fixed_jumps[offset] = end

        (line_no, next_line_byte) = self.lines[offset]
        jump_back = self.last_instr(start, end, self.opc.JUMP_ABSOLUTE,
                                    next_line_byte, False)

        if jump_back:
            jump_forward_offset = xdis.next_offset(code[jump_back],
                                                   self.opc, jump_back)
        else:
            jump_forward_offset = None

        return_val_offset1 = self.prev[self.prev[end]]

        # Discard the candidate jump-back when the loop body really ends
        # in RETURN_VALUE (optionally followed by POP_BLOCK).
        if (jump_back and jump_back != self.prev_op[end]
                and self.is_jump_forward(jump_forward_offset)):
            if code[self.prev_op[end]] == self.opc.RETURN_VALUE or (
                    code[self.prev_op[end]] == self.opc.POP_BLOCK
                    and code[return_val_offset1] == self.opc.RETURN_VALUE):
                jump_back = None
        if not jump_back:
            # loop suite ends in return
            jump_back = self.last_instr(start, end, self.opc.RETURN_VALUE)
            if not jump_back:
                return

            jb_inst = self.get_inst(jump_back)
            jump_back = self.next_offset(jb_inst.opcode, jump_back)

            if_offset = None
            if code[self.prev_op[next_line_byte]] not in JUMP_TF:
                if_offset = self.prev[next_line_byte]
            if if_offset:
                loop_type = "while"
                self.ignore_if.add(if_offset)
            else:
                loop_type = "for"
            target = next_line_byte
            # NOTE(review): the 3 is presumably an instruction size -- confirm.
            end = jump_back + 3
        else:
            if self.get_target(jump_back) >= next_line_byte:
                jump_back = self.last_instr(start, end,
                                            self.opc.JUMP_ABSOLUTE, start,
                                            False)

            jb_inst = self.get_inst(jump_back)

            jb_next_offset = self.next_offset(jb_inst.opcode, jump_back)
            # NOTE(review): "jump_back + 4" below is a magic number whose
            # meaning is not visible from this function -- confirm.
            if end > jb_next_offset and self.is_jump_forward(end):
                if self.is_jump_forward(jb_next_offset):
                    if self.get_target(jump_back + 4) == self.get_target(end):
                        self.fixed_jumps[offset] = jump_back + 4
                        end = jb_next_offset
            elif target < offset:
                self.fixed_jumps[offset] = jump_back + 4
                end = jb_next_offset

            target = self.get_target(jump_back)

            if code[target] in (self.opc.FOR_ITER, self.opc.GET_ITER):
                loop_type = "for"
            else:
                loop_type = "while"
                test = self.prev_op[next_line_byte]

                if test == offset:
                    loop_type = "while 1"
                elif self.code[test] in self.opc.JUMP_OPs:
                    self.ignore_if.add(test)
                    test_target = self.get_target(test)
                    if test_target > (jump_back + 3):
                        jump_back = test_target
            self.not_continue.add(jump_back)
        self.loops.append(target)
        self.structs.append({
            "type": loop_type + "-loop",
            "start": target,
            "end": jump_back
        })
        after_jump_offset = xdis.next_offset(code[jump_back], self.opc,
                                             jump_back)
        # Step over a POP_TOP that directly follows the loop's jump.
        if self.get_inst(after_jump_offset).opname == "POP_TOP":
            after_jump_offset = xdis.next_offset(code[after_jump_offset],
                                                 self.opc,
                                                 after_jump_offset)
        # Anything left between the loop body and its end is the "else".
        if after_jump_offset != end:
            self.structs.append({
                "type": loop_type + "-else",
                "start": after_jump_offset,
                "end": end,
            })
    elif op in self.pop_jump_tf:
        start = offset + instruction_size(op, self.opc)
        target = self.get_target(offset)
        rtarget = self.restrict_to_parent(target, parent)
        prev_op = self.prev_op

        # Do not let jump to go out of parent struct bounds
        if target != rtarget and parent["type"] == "and/or":
            self.fixed_jumps[offset] = rtarget
            return

        # Does this jump to right after another conditional jump that is
        # not myself?  If so, it's part of a larger conditional.
        # rocky: if we have a conditional jump to the next instruction, then
        # possibly I am "skipping over" a "pass" or null statement.
        if ((code[prev_op[target]] in self.pop_jump_if_pop)
                and (target > offset) and prev_op[target] != offset):
            self.fixed_jumps[offset] = prev_op[target]
            self.structs.append({
                "type": "and/or",
                "start": start,
                "end": prev_op[target]
            })
            return

        # The op offset just before the target jump offset is important
        # in making a determination of what we have. Save that.
        pre_rtarget = prev_op[rtarget]

        # Is it an "and" inside an "if" or "while" block
        if op == opc.JUMP_IF_FALSE:

            # Search for another JUMP_IF_FALSE targeting the same op,
            # in current statement, starting from current offset, and filter
            # everything inside inner 'or' jumps and midline ifs
            match = self.rem_or(start, self.next_stmt[offset],
                                opc.JUMP_IF_FALSE, target)

            # If we still have any offsets in set, start working on it
            if match:
                is_jump_forward = self.is_jump_forward(pre_rtarget)
                if (is_jump_forward and pre_rtarget not in self.stmts
                        and self.restrict_to_parent(
                            self.get_target(pre_rtarget), parent) == rtarget):
                    # The two "pass" branches below deliberately leave the
                    # jump alone; only the final "else" records a fix.
                    if (code[prev_op[pre_rtarget]] == self.opc.JUMP_ABSOLUTE
                            and self.remove_mid_line_ifs([offset])
                            and target == self.get_target(
                                prev_op[pre_rtarget])
                            and (prev_op[pre_rtarget] not in self.stmts
                                 or self.get_target(prev_op[pre_rtarget]) >
                                 prev_op[pre_rtarget])
                            and 1 == len(
                                self.remove_mid_line_ifs(
                                    self.rem_or(start, prev_op[pre_rtarget],
                                                JUMP_TF, target)))):
                        pass
                    elif (code[prev_op[pre_rtarget]] == self.opc.RETURN_VALUE
                          and self.remove_mid_line_ifs([offset])
                          and 1 == (len(
                              set(
                                  self.remove_mid_line_ifs(
                                      self.rem_or(
                                          start,
                                          prev_op[pre_rtarget],
                                          JUMP_TF,
                                          target,
                                      )))
                              | set(
                                  self.remove_mid_line_ifs(
                                      self.rem_or(
                                          start,
                                          prev_op[pre_rtarget],
                                          (
                                              opc.JUMP_IF_FALSE,
                                              opc.JUMP_IF_TRUE,
                                              opc.JUMP_ABSOLUTE,
                                          ),
                                          pre_rtarget,
                                          True,
                                      )))))):
                        pass
                    else:
                        fix = None
                        jump_ifs = self.inst_matches(
                            start, self.next_stmt[offset], opc.JUMP_IF_FALSE)
                        last_jump_good = True
                        for j in jump_ifs:
                            if target == self.get_target(j):
                                # FIXME: remove magic number
                                if self.lines[
                                        j].next == j + 3 and last_jump_good:
                                    fix = j
                                    break
                            else:
                                last_jump_good = False
                        self.fixed_jumps[offset] = fix or match[-1]
                        return
                else:
                    self.fixed_jumps[offset] = match[-1]
                    return
        # op == JUMP_IF_TRUE
        else:
            # NOTE: this local shadows the builtin next() for the rest of
            # this branch.
            next = self.next_stmt[offset]
            if prev_op[next] == offset:
                pass
            elif self.is_jump_forward(next) and target == self.get_target(
                    next):
                if code[prev_op[next]] == opc.JUMP_IF_FALSE:
                    if (code[next] == self.opc.JUMP_FORWARD
                            or target != rtarget
                            or code[prev_op[pre_rtarget]] not in
                            (self.opc.JUMP_ABSOLUTE, self.opc.RETURN_VALUE)):
                        self.fixed_jumps[offset] = prev_op[next]
                        return
            elif (code[next] == self.opc.JUMP_ABSOLUTE
                  and self.is_jump_forward(target)
                  and self.get_target(target) == self.get_target(next)):
                self.fixed_jumps[offset] = prev_op[next]
                return

        # Don't add a struct for a while test, it's already taken care of
        if offset in self.ignore_if:
            return

        if (code[pre_rtarget] == self.opc.JUMP_ABSOLUTE
                and pre_rtarget in self.stmts and pre_rtarget != offset
                and prev_op[pre_rtarget] != offset
                and not (code[rtarget] == self.opc.JUMP_ABSOLUTE
                         and code[rtarget + 3] == self.opc.POP_BLOCK
                         and code[prev_op[pre_rtarget]] !=
                         self.opc.JUMP_ABSOLUTE)):
            rtarget = pre_rtarget

        # Does the "jump if" jump beyond a jump op?
        # That is, we have something like:
        #  JUMP_IF_FALSE HERE
        #  ...
        # JUMP_FORWARD
        # HERE:
        #
        # If so, this can be block inside an "if" statement
        # or a conditional assignment like:
        #   x = 1 if x else 2
        #
        # There are other contexts we may need to consider
        # like whether the target is "END_FINALLY"
        # or if the condition jump is to a forward location
        if self.is_jump_forward(pre_rtarget):
            if_end = self.get_target(pre_rtarget, 0)

            # If the jump target is back, we are looping
            if if_end < pre_rtarget and (code[prev_op[if_end]] ==
                                         self.opc.SETUP_LOOP):
                if if_end > start:
                    return

            end = self.restrict_to_parent(if_end, parent)

            self.structs.append({
                "type": "if-then",
                "start": start,
                "end": pre_rtarget
            })
            self.not_continue.add(pre_rtarget)

            # if rtarget < end and (
            #         code[rtarget] not in (self.opc.END_FINALLY,
            #                               self.opc.JUMP_ABSOLUTE) and
            #         code[prev_op[pre_rtarget]] not in (self.opc.POP_EXCEPT,
            #                                         self.opc.END_FINALLY)):
            #     self.structs.append({'type': 'else',
            #                          'start': rtarget,
            #                          'end': end})
            #     self.else_start[rtarget] = end
        elif self.is_jump_back(pre_rtarget, 0):
            # NOTE(review): if_end is assigned here but not read again
            # within this function.
            if_end = rtarget
            self.structs.append({
                "type": "if-then",
                "start": start,
                "end": pre_rtarget
            })
            self.not_continue.add(pre_rtarget)
        elif code[pre_rtarget] in (self.opc.RETURN_VALUE,
                                   self.opc.BREAK_LOOP):
            self.structs.append({
                "type": "if-then",
                "start": start,
                "end": rtarget
            })
            # It is important to distinguish if this return is inside some sort
            # except block return
            jump_prev = prev_op[offset]
            if self.is_pypy and code[jump_prev] == self.opc.COMPARE_OP:
                if self.opc.cmp_op[code[jump_prev + 1]] == "exception-match":
                    return
            if self.version >= 3.5:
                # Python 3.5 may remove as dead code a JUMP
                # instruction after a RETURN_VALUE. So we check
                # based on seeing SETUP_EXCEPT various places.
                if code[rtarget] == self.opc.SETUP_EXCEPT:
                    return
                # Check that next instruction after pops and jump is
                # not from SETUP_EXCEPT
                next_op = rtarget
                if code[next_op] == self.opc.POP_BLOCK:
                    next_op += instruction_size(self.code[next_op], self.opc)
                if code[next_op] == self.opc.JUMP_ABSOLUTE:
                    next_op += instruction_size(self.code[next_op], self.opc)
                if next_op in targets:
                    for try_op in targets[next_op]:
                        come_from_op = code[try_op]
                        if come_from_op == self.opc.SETUP_EXCEPT:
                            return
                        pass
                    pass
            if code[pre_rtarget] == self.opc.RETURN_VALUE:
                if self.version == 3.0:
                    next_op = rtarget
                    if code[next_op] == self.opc.POP_TOP:
                        # NOTE(review): this assignment is a no-op, since
                        # next_op already equals rtarget; an advance past
                        # the POP_TOP may have been intended -- confirm.
                        next_op = rtarget
                    # A return that sits right at the end of a recorded
                    # while-loop is not treated as RETURN_END_IF.
                    for block in self.structs:
                        if (block["type"] == "while-loop"
                                and block["end"] == next_op):
                            return
                    next_op += instruction_size(self.code[next_op], self.opc)
                    if code[next_op] == self.opc.POP_BLOCK:
                        return
                self.return_end_ifs.add(pre_rtarget)
            else:
                self.fixed_jumps[offset] = rtarget
                self.not_continue.add(pre_rtarget)

    elif op == self.opc.SETUP_EXCEPT:
        # The block end is the jump target clipped to the parent structure.
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op == self.opc.SETUP_FINALLY:
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op in self.jump_if_pop:
        target = self.get_target(offset)
        if target > offset:
            unop_target = self.last_instr(offset, target,
                                          self.opc.JUMP_FORWARD, target)
            if unop_target and code[unop_target + 3] != self.opc.ROT_TWO:
                self.fixed_jumps[offset] = unop_target
            else:
                self.fixed_jumps[offset] = self.restrict_to_parent(
                    target, parent)
                pass
            pass
    elif self.version >= 3.5:
        # 3.5+ has Jump optimization which too often causes RETURN_VALUE to get
        # misclassified as RETURN_END_IF. Handle that here.
        # In RETURN_VALUE, JUMP_ABSOLUTE, RETURN_VALUE is never RETURN_END_IF
        if op == self.opc.RETURN_VALUE:
            if (offset + 1 < len(code)
                    and code[offset + 1] == self.opc.JUMP_ABSOLUTE
                    and offset in self.return_end_ifs):
                self.return_end_ifs.remove(offset)
                pass
            pass
        elif op == self.opc.JUMP_FORWARD:
            # If we have:
            #   JUMP_FORWARD x, [non-jump, insns], RETURN_VALUE, x:
            # then RETURN_VALUE is not RETURN_END_IF
            rtarget = self.get_target(offset)
            rtarget_prev = self.prev[rtarget]
            if (code[rtarget_prev] == self.opc.RETURN_VALUE
                    and rtarget_prev in self.return_end_ifs):
                # Walk backwards from the RETURN_VALUE to this jump; give
                # up as soon as another jump is found in between.
                i = rtarget_prev
                while i != offset:
                    if code[i] in [opc.JUMP_FORWARD, opc.JUMP_ABSOLUTE]:
                        return
                    i = self.prev[i]
                self.return_end_ifs.remove(rtarget_prev)
            pass
    return
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from an uncompyle6 code object, and transform them,
    returning a list of uncompyle6 Token's.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
       -  some EXTENDED_ARGS instructions are removed

    Also, when we encounter certain tokens, we add them to a set which will cause custom
    grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
    cause specific rules for the specific number of arguments they take.

    co       -- the code object to tokenize
    show_asm -- 'before', 'after' or 'both' to print the instructions before
                and/or the tokens after processing; when falsy, self.show_asm
                is used instead

    classname and code_objects are not referenced in this body.

    Returns the pair (tokens, customize).
    """

    if not show_asm:
        show_asm = self.show_asm

    bytecode = self.build_instructions(co)

    # show_asm = 'both'
    if show_asm in ('both', 'before'):
        for instr in bytecode.get_instructions(co):
            print(instr.disassemble())

    # list of tokens/instructions
    tokens = []

    # "customize" is in the process of going away here
    customize = {}

    if self.is_pypy:
        customize['PyPy'] = 0

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()

    n = len(self.insts)
    for i, inst in enumerate(self.insts):
        # We need to detect the difference between:
        #   raise AssertionError
        #  and
        #   assert ...
        # If we have a JUMP_FORWARD after the
        # RAISE_VARARGS then we have a "raise" statement
        # else we have an "assert" statement.
        if self.version == 3.0:
            # There is an implied JUMP_IF_TRUE that we are not testing for (yet?) here
            assert_can_follow = inst.opname == 'POP_TOP' and i+1 < n
        else:
            assert_can_follow = inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n
        if assert_can_follow:
            next_inst = self.insts[i+1]
            if (next_inst.opname == 'LOAD_GLOBAL' and
                    next_inst.argval == 'AssertionError'):
                if (i + 2 < n and
                        self.insts[i+2].opname.startswith('RAISE_VARARGS')):
                    self.load_asserts.add(next_inst.offset)
                    pass
                pass

    # Get jump targets
    # Format: {target offset: [jump offsets]}
    jump_targets = self.find_jump_targets(show_asm)
    # print("XXX2", jump_targets)
    last_op_was_break = False

    for i, inst in enumerate(self.insts):

        argval = inst.argval
        op = inst.opcode

        if inst.opname == 'EXTENDED_ARG':
            # FIXME: The EXTENDED_ARG is used to signal annotation
            # parameters
            if (i+1 < n and
                    self.insts[i+1].opcode != self.opc.MAKE_FUNCTION):
                continue

        if inst.offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in *descending*
            # offset order so we have the larger range or biggest instruction interval
            # last. (I think they are sorted in increasing order, but for safety
            # we sort them). That way, specific COME_FROM tags will match up
            # properly. For example, a "loop" with an "if" nested in it should have the
            # "loop" tag last so the grammar rule matches that properly.
            for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
                come_from_name = 'COME_FROM'
                opname = self.opname_for_offset(jump_offset)
                if opname == 'EXTENDED_ARG':
                    # NOTE(review): "op" here is the opcode of the current
                    # instruction, not of the instruction at jump_offset.
                    j = xdis.next_offset(op, self.opc, jump_offset)
                    opname = self.opname_for_offset(j)

                if opname.startswith('SETUP_'):
                    come_from_type = opname[len('SETUP_'):]
                    come_from_name = 'COME_FROM_%s' % come_from_type
                    pass
                elif inst.offset in self.except_targets:
                    come_from_name = 'COME_FROM_EXCEPT_CLAUSE'
                # The pseudo-token's offset is "<target offset>_<n>" so that
                # several COME_FROMs at the same target stay distinct.
                tokens.append(Token(come_from_name,
                                    jump_offset, repr(jump_offset),
                                    offset='%s_%s' % (inst.offset, jump_idx),
                                    has_arg = True, opc=self.opc))
                jump_idx += 1
                pass
            pass
        elif inst.offset in self.else_start:
            end_offset = self.else_start[inst.offset]
            tokens.append(Token('ELSE',
                                None, repr(end_offset),
                                offset='%s' % (inst.offset),
                                has_arg = True, opc=self.opc))

            pass

        pattr = inst.argrepr
        opname = inst.opname

        if op in self.opc.CONST_OPS:
            const = argval
            if iscode(const):
                # Constants that are code objects get a token name that
                # says what kind of code they hold.
                if const.co_name == '<lambda>':
                    assert opname == 'LOAD_CONST'
                    opname = 'LOAD_LAMBDA'
                elif const.co_name == '<genexpr>':
                    opname = 'LOAD_GENEXPR'
                elif const.co_name == '<dictcomp>':
                    opname = 'LOAD_DICTCOMP'
                elif const.co_name == '<setcomp>':
                    opname = 'LOAD_SETCOMP'
                elif const.co_name == '<listcomp>':
                    opname = 'LOAD_LISTCOMP'
                # verify() uses 'pattr' for comparison, since 'attr'
                # now holds Code(const) and thus can not be used
                # for comparison (todo: think about changing this)
                # pattr = 'code_object @ 0x%x %s->%s' %\
                # (id(const), const.co_filename, const.co_name)
                pattr = '<code_object ' + const.co_name + '>'
            else:
                if isinstance(inst.arg, int) and inst.arg < len(co.co_consts):
                    argval, _ = _get_const_info(inst.arg, co.co_consts)
                # Why don't we use _ above for "pattr" rather than "const"?
                # This *is* a little hoaky, but we have to coordinate with
                # other parts like n_LOAD_CONST in pysource.py for example.
                pattr = const
                pass
        elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
            if self.version >= 3.6:
                # 3.6+ doesn't have MAKE_CLOSURE, so opname == 'MAKE_FUNCTION'
                flags = argval
                opname = 'MAKE_FUNCTION_%d' % (flags)
                # Unpack the low bits of the operand, one per entry of
                # MAKE_FUNCTION_FLAGS, least-significant bit first.
                attr = []
                for flag in self.MAKE_FUNCTION_FLAGS:
                    bit = flags & 1
                    attr.append(bit)
                    flags >>= 1
                attr = attr[:4]  # remove last value: attr[5] == False
            else:
                pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval)
                pattr = ("%d positional, %d keyword pair, %d annotated" %
                         (pos_args, name_pair_args, annotate_args))
                if name_pair_args > 0:
                    opname = '%s_N%d' % (opname, name_pair_args)
                    pass
                if annotate_args > 0:
                    opname = '%s_A_%d' % (opname, annotate_args)
                    pass
                opname = '%s_%d' % (opname, pos_args)
                attr = (pos_args, name_pair_args, annotate_args)
            # This token is appended here (with attr rather than argval),
            # so the common append at the bottom of the loop is skipped.
            tokens.append(
                Token(
                    opname = opname,
                    attr = attr,
                    pattr = pattr,
                    offset = inst.offset,
                    linestart = inst.starts_line,
                    op = op,
                    has_arg = inst.has_arg,
                    opc = self.opc
                )
            )
            continue
        elif op in self.varargs_ops:
            pos_args = argval
            if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
                opname = 'BUILD_MAP_n'
            else:
                opname = '%s_%d' % (opname, pos_args)

        elif self.is_pypy and opname == 'JUMP_IF_NOT_DEBUG':
            # The value in the dict is in special cases in semantic actions, such
            # as JUMP_IF_NOT_DEBUG. The value is not used in these cases, so we put
            # in arbitrary value 0.
            customize[opname] = 0
        elif opname == 'UNPACK_EX':
            # FIXME: try with scanner and parser by
            # changing argval
            before_args = argval & 0xFF
            after_args = (argval >> 8) & 0xff
            pattr = "%d before vararg, %d after" % (before_args, after_args)
            argval = (before_args, after_args)
            opname = '%s_%d+%d' % (opname, before_args, after_args)

        elif op == self.opc.JUMP_ABSOLUTE:
            # Further classify JUMP_ABSOLUTE into backward jumps
            # which are used in loops, and "CONTINUE" jumps which
            # may appear in a "continue" statement.  The loop-type
            # and continue-type jumps will help us classify loop
            # boundaries The continue-type jumps help us get
            # "continue" statements with would otherwise be turned
            # into a "pass" statement because JUMPs are sometimes
            # ignored in rules as just boundary overhead. In
            # comprehensions we might sometimes classify JUMP_BACK
            # as CONTINUE, but that's okay since we add a grammar
            # rule for that.
            pattr = argval
            target = self.get_target(inst.offset)
            if target <= inst.offset:
                # NOTE(review): assumes a backward jump is never the last
                # instruction; insts[i+1] would raise IndexError otherwise.
                next_opname = self.insts[i+1].opname

                # 'Continue's include jumps to loops that are not
                # and the end of a block which follow with POP_BLOCK and COME_FROM_LOOP.
                # If the JUMP_ABSOLUTE is to a FOR_ITER and it is followed by another JUMP_FORWARD
                # then we'll take it as a "continue".
                is_continue = (self.insts[self.offset2inst_index[target]]
                               .opname == 'FOR_ITER'
                               and self.insts[i+1].opname == 'JUMP_FORWARD')

                if (is_continue or
                        (inst.offset in self.stmts and (inst.starts_line and
                         next_opname not in self.not_continue_follow))):
                    opname = 'CONTINUE'
                else:
                    opname = 'JUMP_BACK'
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    # There are other situations where we don't catch
                    # CONTINUE as well.
                    if tokens[-1].kind == 'JUMP_BACK' and tokens[-1].attr <= argval:
                        if tokens[-2].kind == 'BREAK_LOOP':
                            del tokens[-1]
                        else:
                            # intern is used because we are changing the *previous* token
                            tokens[-1].kind = intern('CONTINUE')
                # A CONTINUE directly after BREAK_LOOP is dropped: no token
                # is emitted for it.
                if last_op_was_break and opname == 'CONTINUE':
                    last_op_was_break = False
                    continue

        # FIXME: go over for Python 3.6+. This is sometimes wrong
        elif op == self.opc.RETURN_VALUE:
            if inst.offset in self.return_end_ifs:
                opname = 'RETURN_END_IF'

        elif inst.offset in self.load_asserts:
            opname = 'LOAD_ASSERT'

        last_op_was_break = opname == 'BREAK_LOOP'
        tokens.append(
            Token(
                opname = opname,
                attr = argval,
                pattr = pattr,
                offset = inst.offset,
                linestart = inst.starts_line,
                op = op,
                has_arg = inst.has_arg,
                opc = self.opc
            )
        )
        pass

    if show_asm in ('both', 'after'):
        for t in tokens:
            print(t.format(line_prefix='L.'))
        print()
    return tokens, customize
def basic_blocks(version, is_pypy, fn):
    """Split the instructions of ``fn`` into a list of basic blocks.

    Two passes are made over ``get_instructions(fn)``.  The first one
    collects every offset that some jump instruction can transfer to.
    The second one walks the instructions again, accumulating flags and
    jump offsets for the block in progress and closing it (through
    ``BBMgr.add_bb``) at jump targets, jumps, breaks and instructions
    that do not fall through.

    Returns the manager's ``bb_list``.
    """
    BB = BBMgr(version, is_pypy)

    def jump_destination(inst, fallthrough):
        # Absolute jumps carry the destination in their operand; the rest
        # are relative to the offset of the following instruction.
        if inst.opcode in BB.JABS_INSTRUCTIONS:
            return inst.arg
        return fallthrough + inst.arg

    # Pass 1: gather all jump targets.
    jump_targets = set()
    for inst in get_instructions(fn):
        fallthrough = next_offset(inst.opcode, BB.opcode, inst.offset)
        if inst.opcode in BB.JUMP_INSTRUCTIONS:
            jump_targets.add(jump_destination(inst, fallthrough))

    # Pass 2: carve out the blocks.
    start_offset = 0
    end_offset = -1
    prev_offset = -1
    jump_offsets = set()
    flags = {BB_ENTRY}
    # Stack of loop-exit offsets; the -1 sentinel keeps [-1] valid.
    endloop_offsets = [-1]

    for inst in get_instructions(fn):
        op = inst.opcode
        offset = inst.offset
        prev_offset, end_offset = end_offset, offset
        follow_offset = next_offset(op, BB.opcode, offset)

        # Track the innermost enclosing loop's exit.
        if op == BB.opcode.SETUP_LOOP:
            endloop_offsets.append(follow_offset + inst.arg)
        elif offset == endloop_offsets[-1]:
            endloop_offsets.pop()

        if op in BB.LOOP_INSTRUCTIONS:
            flags.add(BB_LOOP)
        elif op in BB.BREAK_INSTRUCTIONS:
            # A break transfers to the end of the innermost loop and
            # finishes the current block.
            flags.add(BB_BREAK)
            jump_offsets.add(endloop_offsets[-1])
            flags, jump_offsets = BB.add_bb(
                start_offset, end_offset, follow_offset, flags, jump_offsets)
            start_offset = follow_offset

        if offset in jump_targets and start_offset < end_offset:
            # Both a fall-through path and a jump path arrive here, so
            # this instruction starts a new block: close the previous one.
            flags, jump_offsets = BB.add_bb(
                start_offset, prev_offset, end_offset, flags, jump_offsets)
            start_offset = end_offset

        # Flag the block according to the class of this instruction.
        if op in BB.BLOCK_INSTRUCTIONS:
            flags.add(BB_BLOCK)
        elif op in BB.EXCEPT_INSTRUCTIONS:
            flags.add(BB_EXCEPT)
        elif op in BB.FINALLY_INSTRUCTIONS:
            flags.add(BB_FINALLY)
        elif op in BB.FOR_INSTRUCTIONS:
            flags.add(BB_FOR)
        elif op in BB.JUMP_INSTRUCTIONS:
            # In theory an unconditional jump could stay inside one
            # (extended) basic block, but ending the block here mirrors
            # the code more simply.  Record where the jump goes.
            jump_offsets.add(jump_destination(inst, follow_offset))
            unconditional = op in BB.JUMP_UNCONDITONAL
            if unconditional:
                flags.add(BB_JUMP_UNCONDITIONAL)
            if unconditional or op != BB.opcode.SETUP_LOOP:
                flags, jump_offsets = BB.add_bb(
                    start_offset, end_offset, follow_offset, flags,
                    jump_offsets)
                start_offset = follow_offset
        elif op in BB.NOFOLLOW_INSTRUCTIONS:
            flags.add(BB_NOFOLLOW)
            flags, jump_offsets = BB.add_bb(
                start_offset, end_offset, follow_offset, flags, jump_offsets)
            start_offset = follow_offset

    # Nothing follows the last block recorded so far.
    if len(BB.bb_list) > 0:
        BB.bb_list[-1].follow_offset = None

    # Instructions not yet assigned to a block form a final one.
    if start_offset <= end_offset:
        trailing = BasicBlock(start_offset, end_offset, None,
                              flags=flags, jump_offsets=jump_offsets)
        BB.bb_list.append(trailing)

    return BB.bb_list
def next_offset(self, op, offset: int) -> int:
    """Return what ``xdis.next_offset`` reports for ``op`` at ``offset``.

    Convenience wrapper that supplies ``self.opc`` as the middle argument
    so callers only pass the opcode and its offset.
    """
    opc = self.opc
    return xdis.next_offset(op, opc, offset)
def reduce_is_invalid(self, rule, ast, tokens, first, last):
    """Return a truthy value when reducing by ``rule`` must be rejected.

    The superclass verdict is consulted first and returned as-is when
    truthy.  After that, specific rules are checked, mostly to confirm
    that a conditional jump in the subtree lands where the construct
    needs it to.

    rule   -- ``(lhs, rhs_tuple)`` grammar rule being reduced
    ast    -- parse subtree built for the rule
    tokens -- the token list being parsed
    first  -- index in ``tokens`` of the first token covered by the rule
    last   -- index in ``tokens`` used as the "end" token in the checks

    Rules without a dedicated check, and checks that reach no verdict,
    accept the reduction (``False``).
    """

    def jump_misses_end(jump, end_token):
        # Landing spot is offset + operand + 3 (NOTE(review): 3 is
        # presumably the jump instruction's size -- confirm).  Accept
        # when the jump lands on the end token or both carry the same
        # printable operand.
        landing = jump.offset + jump.attr + 3
        if landing == end_token.offset:
            return False
        return not (end_token.pattr == jump.pattr)

    inherited = super(Python27Parser, self).reduce_is_invalid(
        rule, ast, tokens, first, last)
    if inherited:
        return inherited

    if rule == ("and", ("expr", "jmp_false", "expr", "\\e_come_from_opt")):
        # An "and" that is immediately followed by YIELD_VALUE is more
        # likely the "if" part of a comprehension.  (Checking that
        # last+1 is POP_TOP might be worth adding too.)
        if tokens[last] == "YIELD_VALUE":
            return True

        # The jump has to go somewhere beyond the end of the "and".  It
        # need not be exactly the end, because this "and" can be part of
        # a larger condition.  Oddly, 2.7 does not seem to optimize the
        # "and" jump into a jump back to a loop.
        jmp_node = ast[1]
        if jmp_node[0] == "POP_JUMP_IF_FALSE":
            # Back up over tokens whose offset is a string.
            while first < last and isinstance(tokens[last].offset, str):
                last -= 1
            if jmp_node[0].attr < tokens[last].offset:
                return True

        # Otherwise jmp_false has to reach the end of the "and", or go to
        # the same place the end of the "and" goes.
        return jump_misses_end(ast[1][0], tokens[last])

    if rule == ("comp_if", ("expr", "jmp_false", "comp_iter")):
        cond_jump = ast[1][0]
        if cond_jump == "POP_JUMP_IF_FALSE":
            # Invalid when the jump lands strictly inside the construct.
            return tokens[first].offset < cond_jump.attr < tokens[last].offset
        return False

    if (rule[0], rule[1][0:5]) == ("conditional",
                                   ("expr", "jmp_false", "expr",
                                    "JUMP_ABSOLUTE", "expr")):
        cond_jump = ast[1][0]
        if not (cond_jump == "POP_JUMP_IF_FALSE"):
            return False
        # The false-jump has to land on the first instruction of the
        # "else" expression ...
        else_instr = ast[4].first_child()
        if cond_jump.attr != else_instr.offset:
            return True
        # ... and the JUMP_ABSOLUTE must not stop short of the end token.
        return ast[3].attr < tokens[last].offset

    lhs = rule[0]

    if lhs == "raise_stmt1":
        raised = ast[0]
        return raised == "expr" and raised[0] == "or"

    if lhs in ("assert", "assert2"):
        assert_target = ast[1][0].attr
        if last >= len(tokens):
            return False
        if assert_target == tokens[last].offset:
            return False
        tail = ast[-1]
        return not (assert_target == next_offset(tail.op, tail.opc,
                                                 tail.offset))

    if rule == ("iflaststmtl", ("testexpr", "c_stmts")):
        test = ast[0][0]
        if test not in ("testfalse", "testtrue"):
            return False
        if not (len(test) > 1 and test[1].kind.startswith("jmp_")):
            return False
        jmp_target = test[1][0].attr
        if last == len(tokens):
            last -= 1
        # Narrow [first, last] to tokens whose offset is not a string.
        while isinstance(tokens[first].offset, str) and first < last:
            first += 1
        if first == last:
            return True
        while first < last and isinstance(tokens[last].offset, str):
            last -= 1
        return tokens[first].off2int() < jmp_target < tokens[last].off2int()

    if rule in (("list_if_not", ("expr", "jmp_true", "list_iter")),
                ("list_if", ("expr", "jmp_false", "list_iter"))):
        # Reject when the jump lands strictly between the jump itself and
        # the end token.
        cond_jump = ast[1][0]
        return cond_jump.offset < cond_jump.attr < tokens[last].offset

    if rule == ("or", ("expr", "jmp_true", "expr", "\\e_come_from_opt")):
        # jmp_true must not land in the middle of the "or".
        return jump_misses_end(ast[1][0], tokens[last])

    if (lhs == "whilestmt"
            and rule[1][0:-2] == ("SETUP_LOOP", "testexpr", "l_stmts_opt",
                                  "JUMP_BACK", "JUMP_BACK")):
        # The two adjacent JUMP_BACKs have to go to the same place.
        idx = last - 1
        while tokens[idx] != "JUMP_BACK":
            idx -= 1
        return tokens[idx].attr != tokens[idx - 1].attr

    if lhs == "if_expr_true":
        return first > 0 and tokens[first - 1] == "POP_JUMP_IF_FALSE"

    return False
def find_jump_targets(self, debug):
    """
    Find every offset in the bytecode that is a jump target, i.e. a place
    where a COME_FROM instruction might be inserted.

    Returns a dict that maps each target offset to the list of offsets
    jumping to it; one instruction can be jumped to from several places.

    As a side effect the per-code-object bookkeeping containers
    (structs, loops, fixed_jumps, ...) are reset, and
    detect_control_flow() is run on every instruction other than
    EXTENDED_ARG.  When debug is 'both' or 'after', self.structs is
    pretty-printed at the end.
    """
    code = self.code
    n = len(code)
    self.structs = [{'type': 'root', 'start': 0, 'end': n-1}]

    # All loop entry points
    self.loops = []

    # Map fixed jumps to their real destination
    self.fixed_jumps = {}
    self.except_targets = {}
    self.ignore_if = set()
    self.build_statement_indices()
    self.else_start = {}

    # Containers filled by detect_control_flow()
    self.not_continue = set()
    self.return_end_ifs = set()
    self.setup_loop_targets = {}  # target given setup_loop offset
    self.setup_loops = {}  # setup_loop offset given target

    targets = {}

    def note_jump(label, source):
        # A fresh list is built each time, so earlier lists are never
        # mutated in place.
        targets[label] = targets.get(label, []) + [source]

    extended_arg = 0
    for offset in self.op_range(0, n):
        op = code[offset]

        if op == self.opc.EXTENDED_ARG:
            # Fold the prefix into the pending extended argument and move
            # on to the instruction it modifies.
            extended_arg = self.extended_arg_val(
                code2num(code, offset+1) | extended_arg)
            continue

        # Determine structures and fix jumps in Python versions
        # since 2.3
        self.detect_control_flow(offset, targets, extended_arg)

        if op >= op3.HAVE_ARGUMENT:
            label = self.fixed_jumps.get(offset)

            # The operand is a single byte from 3.6 on, and two
            # little-endian bytes before that.
            oparg = code[offset+1]
            if self.version < 3.6:
                oparg = oparg + code[offset+2] * 256
            following = xdis.next_offset(op, self.opc, offset)

            if label is None:
                if op in op3.hasjrel and op != self.opc.FOR_ITER:
                    label = following + oparg
                elif (op in op3.hasjabs and op in self.jump_if_pop
                      and oparg > offset):
                    label = oparg

            if label is not None and label != -1:
                note_jump(label, offset)
        elif op == self.opc.END_FINALLY and offset in self.fixed_jumps:
            note_jump(self.fixed_jumps[offset], offset)

        # An extended argument only applies to one instruction.
        extended_arg = 0

    # DEBUG:
    if debug in ('both', 'after'):
        from pprint import pprint
        pprint(self.structs)

    return targets
def ingest(self, co, classname=None, code_objects={}, show_asm=None):
    """
    Pick out tokens from a decompyle3 code object, and transform them,
    returning a list of decompyle3 Token's together with a "customize" dict.

    The transformations are made to assist the deparsing grammar.
    Specifically:
       -  various types of LOAD_CONST's are categorized in terms of what they load
       -  COME_FROM instructions are added to assist parsing control structures
       -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
       -  some EXTENDED_ARGS instructions are removed

    Also, when we encounter certain tokens, we add them to a set which will cause custom
    grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
    cause specific rules for the specific number of arguments they take.

    :param co: code object to tokenize; passed to ``self.build_instructions()``
               and consulted for ``co_consts``.
    :param classname: not referenced in this method body.
    :param code_objects: not referenced in this method body.
                         NOTE(review): mutable default argument.
    :param show_asm: ``"both"``/``"before"``/``"after"`` select disassembly
                     printing; a falsy value falls back to ``self.show_asm``.
    :return: the tuple ``(tokens, customize)``.
    """

    def tokens_append(j, token):
        # Append ``token`` and record its position under its offset; ``j``
        # must always equal the token's index so the assert catches drift.
        tokens.append(token)
        self.offset2tok_index[token.offset] = j
        j += 1
        assert j == len(tokens)
        return j

    if not show_asm:
        show_asm = self.show_asm

    bytecode = self.build_instructions(co)

    # show_asm = 'both'
    if show_asm in ("both", "before"):
        for instr in bytecode.get_instructions(co):
            print(instr.disassemble())

    # "customize" is in the process of going away here
    customize = {}

    if self.is_pypy:
        customize["PyPy"] = 0

    # Scan for assertions. Later we will
    # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
    # 'LOAD_ASSERT' is used in assert statements.
    self.load_asserts = set()

    # list of tokens/instructions
    tokens = []
    self.offset2tok_index = {}

    n = len(self.insts)
    for i, inst in enumerate(self.insts):

        # We need to detect the difference between:
        #   raise AssertionError
        #  and
        #   assert ...
        # If we have a JUMP_FORWARD after the
        # RAISE_VARARGS then we have a "raise" statement
        # else we have an "assert" statement.
        assert_can_follow = inst.opname == "POP_JUMP_IF_TRUE" and i + 1 < n
        if assert_can_follow:
            next_inst = self.insts[i + 1]
            if (
                next_inst.opname == "LOAD_GLOBAL"
                and next_inst.argval == "AssertionError"
            ):
                # Look at the instruction just before the jump target.
                raise_idx = self.offset2inst_index[self.prev_op[inst.argval]]
                raise_inst = self.insts[raise_idx]
                if raise_inst.opname.startswith("RAISE_VARARGS"):
                    self.load_asserts.add(next_inst.offset)
                pass
            pass

    # Operand values in Python wordcode are small. As a result,
    # there are these EXTENDED_ARG instructions - way more than
    # before 3.6. These make parsing a lot of pain.

    # To simplify things we want to untangle this. We also
    # do this loop before we compute jump targets.
    for i, inst in enumerate(self.insts):

        # One artifact of the "too-small" operand problem, is that
        # some backward jumps, are turned into forward jumps to another
        # "extended arg" backward jump to the same location.
        if inst.opname == "JUMP_FORWARD":
            jump_inst = self.insts[self.offset2inst_index[inst.argval]]
            if jump_inst.has_extended_arg and jump_inst.opname.startswith("JUMP"):
                # Create combination of the jump-to instruction and
                # this one. Keep the position information of this instruction,
                # but the operator and operand properties come from the other
                # instruction
                self.insts[i] = Instruction(
                    jump_inst.opname,
                    jump_inst.opcode,
                    jump_inst.optype,
                    jump_inst.inst_size,
                    jump_inst.arg,
                    jump_inst.argval,
                    jump_inst.argrepr,
                    jump_inst.has_arg,
                    inst.offset,
                    inst.starts_line,
                    inst.is_jump_target,
                    inst.has_extended_arg,
                )

    # Get jump targets
    # Format: {target offset: [jump offsets]}
    jump_targets = self.find_jump_targets(show_asm)
    # print("XXX2", jump_targets)

    last_op_was_break = False

    # ``j`` is the index the next appended token will get.
    j = 0
    for i, inst in enumerate(self.insts):

        argval = inst.argval
        op = inst.opcode

        if inst.opname == "EXTENDED_ARG":
            # FIXME: The EXTENDED_ARG is used to signal annotation
            # parameters
            if i + 1 < n and self.insts[i + 1].opcode != self.opc.MAKE_FUNCTION:
                continue

        if inst.offset in jump_targets:
            jump_idx = 0
            # We want to process COME_FROMs to the same offset to be in *descending*
            # offset order so we have the larger range or biggest instruction interval
            # last. (I think they are sorted in increasing order, but for safety
            # we sort them). That way, specific COME_FROM tags will match up
            # properly. For example, a "loop" with an "if" nested in it should have the
            # "loop" tag last so the grammar rule matches that properly.
            for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
                come_from_name = "COME_FROM"
                opname = self.opname_for_offset(jump_offset)
                if opname == "EXTENDED_ARG":
                    # NOTE(review): ``op`` here is the opcode of the current
                    # (target) instruction, not of the instruction at
                    # ``jump_offset`` -- confirm this is intended.
                    k = xdis.next_offset(op, self.opc, jump_offset)
                    opname = self.opname_for_offset(k)

                if opname.startswith("SETUP_"):
                    come_from_type = opname[len("SETUP_") :]
                    come_from_name = "COME_FROM_%s" % come_from_type
                    pass
                elif inst.offset in self.except_targets:
                    come_from_name = "COME_FROM_EXCEPT_CLAUSE"
                # COME_FROM tokens get a string offset "<target>_<n>" so that
                # several of them can precede the same real instruction.
                j = tokens_append(
                    j,
                    Token(
                        come_from_name,
                        jump_offset,
                        repr(jump_offset),
                        offset="%s_%s" % (inst.offset, jump_idx),
                        has_arg=True,
                        opc=self.opc,
                        has_extended_arg=False,
                    ),
                )
                jump_idx += 1
                pass
            pass

        pattr = inst.argrepr
        opname = inst.opname

        if op in self.opc.CONST_OPS:
            const = argval
            if iscode(const):
                # Specialize the opcode name by the kind of code object loaded.
                if const.co_name == "<lambda>":
                    assert opname == "LOAD_CONST"
                    opname = "LOAD_LAMBDA"
                elif const.co_name == "<genexpr>":
                    opname = "LOAD_GENEXPR"
                elif const.co_name == "<dictcomp>":
                    opname = "LOAD_DICTCOMP"
                elif const.co_name == "<setcomp>":
                    opname = "LOAD_SETCOMP"
                elif const.co_name == "<listcomp>":
                    opname = "LOAD_LISTCOMP"
                else:
                    opname = "LOAD_CODE"
                # verify() uses 'pattr' for comparison, since 'attr'
                # now holds Code(const) and thus can not be used
                # for comparison (todo: think about changing this)
                # pattr = 'code_object @ 0x%x %s->%s' %\
                # (id(const), const.co_filename, const.co_name)
                pattr = "<code_object " + const.co_name + ">"
            elif isinstance(const, str):
                opname = "LOAD_STR"
            else:
                if isinstance(inst.arg, int) and inst.arg < len(co.co_consts):
                    argval, _ = _get_const_info(inst.arg, co.co_consts)
                # Why don't we use _ above for "pattr" rather than "const"?
                # This *is* a little hokey, but we have to coordinate with
                # other parts like n_LOAD_CONST in pysource.py for example.
                pattr = const
                pass
        elif opname == "IMPORT_NAME":
            if "." in inst.argval:
                opname = "IMPORT_NAME_ATTR"
                pass
        elif opname in ("MAKE_FUNCTION", "MAKE_CLOSURE"):
            flags = argval
            opname = "MAKE_FUNCTION_%d" % (flags)
            # Unpack the flag word into a list of bits, lowest bit first,
            # one per entry of self.MAKE_FUNCTION_FLAGS.
            attr = []
            for flag in self.MAKE_FUNCTION_FLAGS:
                bit = flags & 1
                attr.append(bit)
                flags >>= 1
            attr = attr[:4]  # remove last value: attr[5] == False
            j = tokens_append(
                j,
                Token(
                    opname=opname,
                    attr=attr,
                    pattr=pattr,
                    offset=inst.offset,
                    linestart=inst.starts_line,
                    op=op,
                    has_arg=inst.has_arg,
                    opc=self.opc,
                    has_extended_arg=inst.has_extended_arg,
                ),
            )
            continue
        elif op in self.varargs_ops:
            pos_args = argval
            if self.is_pypy and not pos_args and opname == "BUILD_MAP":
                opname = "BUILD_MAP_n"
            else:
                opname = "%s_%d" % (opname, pos_args)

        elif self.is_pypy and opname == "JUMP_IF_NOT_DEBUG":
            # The value in the dict is in special cases in semantic actions, such
            # as JUMP_IF_NOT_DEBUG. The value is not used in these cases, so we put
            # in arbitrary value 0.
            customize[opname] = 0
        elif opname == "UNPACK_EX":
            # FIXME: try with scanner and parser by
            # changing argval
            # Low byte: count before the starred target; next byte: count after.
            before_args = argval & 0xFF
            after_args = (argval >> 8) & 0xFF
            pattr = "%d before vararg, %d after" % (before_args, after_args)
            argval = (before_args, after_args)
            opname = "%s_%d+%d" % (opname, before_args, after_args)

        elif op == self.opc.JUMP_ABSOLUTE:
            # Refine JUMP_ABSOLUTE further in into:
            #
            # * "JUMP_BACK"    - which are are used in loops. This is sometimes
            #                   found at the end of a looping construct
            # * "BREAK_LOOP"  - which are are used to break loops.
            # * "CONTINUE"    - jumps which may appear in a "continue" statement.
            #                   It is okay to confuse this with JUMP_BACK. The
            #                   grammar should tolerate this.
            # * "JUMP_FORWARD - forward jumps that are not BREAK_LOOP jumps.
            #
            # The loop-type and continue-type jumps will help us
            # classify loop boundaries The continue-type jumps
            # help us get "continue" statements with would
            # otherwise be turned into a "pass" statement because
            # JUMPs are sometimes ignored in rules as just
            # boundary overhead. Again, in comprehensions we might
            # sometimes classify JUMP_BACK as CONTINUE, but that's
            # okay since grammar rules should tolerate that.
            pattr = argval
            target = self.get_target(inst.offset)
            if target <= inst.offset:
                # NOTE(review): indexes insts[i + 1] without a bounds check;
                # presumably a backward jump is never the last instruction.
                next_opname = self.insts[i + 1].opname

                # 'Continue's include jumps to loops that are not
                # and the end of a block which follow with POP_BLOCK and COME_FROM_LOOP.
                # If the JUMP_ABSOLUTE is to a FOR_ITER and it is followed by another JUMP_FORWARD
                # then we'll take it as a "continue".
                is_continue = (
                    self.insts[self.offset2inst_index[target]].opname == "FOR_ITER"
                    and self.insts[i + 1].opname == "JUMP_FORWARD"
                )

                if self.version < 3.8 and (
                    is_continue
                    or (
                        inst.offset in self.stmts
                        and (
                            inst.starts_line
                            and next_opname not in self.not_continue_follow
                        )
                    )
                ):
                    opname = "CONTINUE"
                else:
                    opname = "JUMP_BACK"
                    # FIXME: this is a hack to catch stuff like:
                    #   if x: continue
                    # the "continue" is not on a new line.
                    # There are other situations where we don't catch
                    # CONTINUE as well.
                    if tokens[-1].kind == "JUMP_BACK" and tokens[-1].attr <= argval:
                        if tokens[-2].kind == "BREAK_LOOP":
                            del tokens[-1]
                        else:
                            # intern is used because we are changing the *previous* token.
                            # A POP_TOP suggests a "break" rather than a "continue"?
                            if tokens[-2] == "POP_TOP":
                                tokens[-1].kind = sys.intern("BREAK_LOOP")
                            else:
                                tokens[-1].kind = sys.intern("CONTINUE")
                                pass
                            pass
                        pass
                # Drop a CONTINUE that directly follows a BREAK_LOOP token.
                if last_op_was_break and opname == "CONTINUE":
                    last_op_was_break = False
                    continue
                pass
            else:
                opname = "JUMP_FORWARD"

        elif opname.startswith("POP_JUMP_IF_") and not inst.jumps_forward():
            opname += "_BACK"

        elif inst.offset in self.load_asserts:
            opname = "LOAD_ASSERT"

        last_op_was_break = opname == "BREAK_LOOP"
        j = tokens_append(
            j,
            Token(
                opname=opname,
                attr=argval,
                pattr=pattr,
                offset=inst.offset,
                linestart=inst.starts_line,
                op=op,
                has_arg=inst.has_arg,
                opc=self.opc,
                has_extended_arg=inst.has_extended_arg,
            ),
        )
        pass

    if show_asm in ("both", "after"):
        for t in tokens:
            print(t.format(line_prefix=""))
        print()
    return tokens, customize
def detect_control_flow(self, offset, targets, inst_index):
    """
    Detect type of block structures and their boundaries to fix optimized jumps
    in python2.3+

    Classifies the instruction at ``offset`` and, depending on its opcode,
    records structures in ``self.structs`` and adjusted jump destinations in
    ``self.fixed_jumps``; it also updates ``self.loops``, ``self.setup_loops``,
    ``self.ignore_if``, ``self.not_continue``, ``self.else_start``,
    ``self.except_targets`` and ``self.return_end_ifs``.

    :param offset: bytecode offset of the instruction being examined.
    :param targets: dict of jump targets collected so far (target offset ->
                    list of jump offsets); only read here.
    :param inst_index: index of the instruction in ``self.insts``.
    :return: always ``None``; all results are side effects on ``self``.
    """

    code = self.code
    inst = self.insts[inst_index]
    op = inst.opcode

    # Detect parent structure
    parent = self.structs[0]
    start = parent["start"]
    end = parent["end"]

    # Pick inner-most parent for our offset
    for struct in self.structs:
        current_start = struct["start"]
        current_end = struct["end"]
        if (current_start <= offset < current_end) and (
            current_start >= start and current_end <= end):
            start = current_start
            end = current_end
            parent = struct

    if self.version < 3.8 and op == self.opc.SETUP_LOOP:
        # We categorize loop types: 'for', 'while', 'while 1' with
        # possibly suffixes '-loop' and '-else'
        # Try to find the jump_back instruction of the loop.
        # It could be a return instruction.

        start += inst.inst_size
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.setup_loops[target] = offset

        if target != end:
            self.fixed_jumps[offset] = end

        (line_no, next_line_byte) = self.lines[offset]
        jump_back = self.last_instr(start, end, self.opc.JUMP_ABSOLUTE,
                                    next_line_byte, False)

        if jump_back:
            jump_forward_offset = xdis.next_offset(code[jump_back], self.opc,
                                                   jump_back)
        else:
            jump_forward_offset = None

        return_val_offset1 = self.prev[self.prev[end]]

        # Discard the candidate jump-back when the loop body actually ends
        # in a RETURN_VALUE (optionally followed by POP_BLOCK).
        if (jump_back and jump_back != self.prev_op[end]
                and self.is_jump_forward(jump_forward_offset)):
            if code[self.prev_op[end]] == self.opc.RETURN_VALUE or (
                    code[self.prev_op[end]] == self.opc.POP_BLOCK
                    and code[return_val_offset1] == self.opc.RETURN_VALUE):
                jump_back = None
        if not jump_back:
            # loop suite ends in return
            jump_back = self.last_instr(start, end, self.opc.RETURN_VALUE)
            if not jump_back:
                return

            jb_inst = self.get_inst(jump_back)
            jump_back = self.next_offset(jb_inst.opcode, jump_back)

            if_offset = None
            if code[self.prev_op[next_line_byte]] not in self.pop_jump_tf:
                if_offset = self.prev[next_line_byte]
            if if_offset:
                loop_type = "while"
                self.ignore_if.add(if_offset)
            else:
                loop_type = "for"
            target = next_line_byte
            end = xdis.next_offset(code[jump_back], self.opc, jump_back)
        else:
            if self.get_target(jump_back) >= next_line_byte:
                jump_back = self.last_instr(start, end, self.opc.JUMP_ABSOLUTE,
                                            start, False)

            jb_inst = self.get_inst(jump_back)

            jb_next_offset = self.next_offset(jb_inst.opcode, jump_back)
            if end > jb_next_offset and self.is_jump_forward(end):
                if self.is_jump_forward(jb_next_offset):
                    if self.get_target(jb_next_offset) == self.get_target(
                            end):
                        self.fixed_jumps[offset] = jb_next_offset
                        end = jb_next_offset
            elif target < offset:
                self.fixed_jumps[offset] = jb_next_offset
                end = jb_next_offset

            target = self.get_target(jump_back)

            if code[target] in (self.opc.FOR_ITER, self.opc.GET_ITER):
                loop_type = "for"
            else:
                loop_type = "while"
                test = self.prev_op[next_line_byte]

                if test == offset:
                    loop_type = "while 1"
                elif self.code[test] in self.opc.JUMP_OPs:
                    self.ignore_if.add(test)
                    test_target = self.get_target(test)
                    if test_target > (jump_back + 3):
                        jump_back = test_target
            self.not_continue.add(jump_back)
        self.loops.append(target)
        self.structs.append({
            "type": loop_type + "-loop",
            "start": target,
            "end": jump_back
        })
        # Anything between the jump-back and the loop end is the "else" part.
        after_jump_offset = xdis.next_offset(code[jump_back], self.opc,
                                             jump_back)
        if after_jump_offset != end:
            self.structs.append({
                "type": loop_type + "-else",
                "start": after_jump_offset,
                "end": end,
            })
    elif op in self.pop_jump_tf:
        start = offset + inst.inst_size
        target = inst.argval
        rtarget = self.restrict_to_parent(target, parent)
        prev_op = self.prev_op

        # Do not let jump to go out of parent struct bounds
        if target != rtarget and parent["type"] == "and/or":
            self.fixed_jumps[offset] = rtarget
            return

        # Does this jump to right after another conditional jump that is
        # not myself?  If so, it's part of a larger conditional.
        # rocky: if we have a conditional jump to the next instruction, then
        # possibly I am "skipping over" a "pass" or null statement.
        pretarget = self.get_inst(prev_op[target])

        if (pretarget.opcode in self.pop_jump_if_pop and (target > offset)
                and pretarget.offset != offset):

            # FIXME: hack upon hack...
            # In some cases the pretarget can be a jump to the next instruction
            # and these aren't and/or's either. We limit to 3.5+ since we experienced there
            # but it might be earlier versions, or might be a general principle.
            if pretarget.argval != target:
                # FIXME: this is not accurate The commented out below
                # is what it should be. However grammar rules right now
                # assume the incorrect offsets.
                # self.fixed_jumps[offset] = target
                self.fixed_jumps[offset] = pretarget.offset
                self.structs.append({
                    "type": "and/or",
                    "start": start,
                    "end": pretarget.offset
                })
                return

        # The opcode *two* instructions before the target jump offset is important
        # in making a determination of what we have. Save that.
        pre_rtarget = prev_op[rtarget]

        if op == self.opc.POP_JUMP_IF_FALSE:
            self.fixed_jumps[offset] = target

        # op == POP_JUMP_IF_TRUE
        else:
            # NOTE: ``next`` shadows the builtin within this branch.
            next = self.next_stmt[offset]
            if prev_op[next] == offset:
                pass
            elif self.is_jump_forward(next) and target == self.get_target(
                    next):
                if code[prev_op[next]] == self.opc.POP_JUMP_IF_FALSE:
                    if (code[next] == self.opc.JUMP_FORWARD
                            or target != rtarget
                            or code[prev_op[pre_rtarget]] not in
                            (self.opc.JUMP_ABSOLUTE, self.opc.RETURN_VALUE)):
                        self.fixed_jumps[offset] = prev_op[next]
                        return
            elif (code[next] == self.opc.JUMP_ABSOLUTE
                  and self.is_jump_forward(target)
                  and self.get_target(target) == self.get_target(next)):
                self.fixed_jumps[offset] = prev_op[next]
                return

        rtarget_is_ja = code[pre_rtarget] == self.opc.JUMP_ABSOLUTE
        if (rtarget_is_ja and pre_rtarget in self.stmts
                and pre_rtarget != offset and prev_op[pre_rtarget] != offset
                and not (code[rtarget] == self.opc.JUMP_ABSOLUTE
                         and code[rtarget + 3] == self.opc.POP_BLOCK
                         and code[prev_op[pre_rtarget]] != self.opc.JUMP_ABSOLUTE)):
            rtarget = pre_rtarget

        # Does the "jump if" jump beyond a jump op?
        # That is, we have something like:
        #  POP_JUMP_IF_FALSE HERE
        #  ...
        # JUMP_FORWARD
        # HERE:
        #
        # If so, this can be block inside an "if" statement
        # or a conditional assignment like:
        #   x = 1 if x else 2
        #
        # For 3.5, for JUMP_FORWARD above we could have also
        # JUMP_BACK or CONTINUE
        #
        # There are other situations we may need to consider, like
        # if the condition jump is to a forward location.
        # Also the existence of a jump to the instruction after "END_FINALLY"
        # will distinguish "try/else" from "try".
        rtarget_break = (self.opc.RETURN_VALUE, self.opc.BREAK_LOOP)

        if self.is_jump_forward(pre_rtarget) or (rtarget_is_ja):
            if_end = self.get_target(pre_rtarget)

            # If the jump target is back, we are looping
            if (if_end < pre_rtarget and self.version < 3.8
                    and (code[prev_op[if_end]] == self.opc.SETUP_LOOP)):
                if if_end > start:
                    return

            end = self.restrict_to_parent(if_end, parent)

            self.structs.append({
                "type": "if-then",
                "start": start,
                "end": pre_rtarget
            })

            # FIXME: add this
            # self.fixed_jumps[offset] = rtarget

            self.not_continue.add(pre_rtarget)

            if rtarget < end and (
                    code[rtarget] not in (self.opc.END_FINALLY,
                                          self.opc.JUMP_ABSOLUTE)
                    and code[prev_op[pre_rtarget]] not in
                    (self.opc.POP_EXCEPT, self.opc.END_FINALLY)):
                self.structs.append({
                    "type": "else",
                    "start": rtarget,
                    "end": end
                })
                self.else_start[rtarget] = end
        elif self.is_jump_back(pre_rtarget, 0):
            if_end = rtarget
            self.structs.append({
                "type": "if-then",
                "start": start,
                "end": pre_rtarget
            })
            self.not_continue.add(pre_rtarget)
        elif code[pre_rtarget] in rtarget_break:
            self.structs.append({
                "type": "if-then",
                "start": start,
                "end": rtarget
            })
            # It is important to distinguish if this return is inside some sort
            # except block return
            jump_prev = prev_op[offset]
            if self.is_pypy and code[jump_prev] == self.opc.COMPARE_OP:
                if self.opc.cmp_op[code[jump_prev + 1]] == "exception-match":
                    return
                pass

            # Check that next instruction after pops and jump is
            # not from SETUP_EXCEPT
            next_op = rtarget
            if code[next_op] == self.opc.POP_BLOCK:
                next_op += instruction_size(self.code[next_op], self.opc)
            if code[next_op] == self.opc.JUMP_ABSOLUTE:
                next_op += instruction_size(self.code[next_op], self.opc)
            if next_op in targets:
                for try_op in targets[next_op]:
                    come_from_op = code[try_op]
                    if self.version < 3.8 and come_from_op == self.opc.SETUP_EXCEPT:
                        return
                    pass

            self.fixed_jumps[offset] = rtarget

            if code[pre_rtarget] == self.opc.RETURN_VALUE:
                # If we are at some sort of POP_JUMP_IF and the instruction before was
                # COMPARE_OP exception-match, then pre_rtarget is not an end_if
                if not (inst_index > 0 and
                        self.insts[inst_index - 1].argval == "exception-match"):
                    self.return_end_ifs.add(pre_rtarget)
            else:
                self.fixed_jumps[offset] = rtarget
                self.not_continue.add(pre_rtarget)
        else:

            # None of the patterns above matched: record forward jumps as-is.
            if target > offset:
                self.fixed_jumps[offset] = target
                pass

    elif self.version < 3.8 and op == self.opc.SETUP_EXCEPT:
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op == self.opc.POP_EXCEPT:
        next_offset = xdis.next_offset(op, self.opc, offset)
        target = self.get_target(next_offset)
        if target > next_offset:
            next_op = code[next_offset]
            if (self.opc.JUMP_ABSOLUTE == next_op
                    and self.opc.END_FINALLY != code[xdis.next_offset(
                        next_op, self.opc, next_offset)]):
                self.fixed_jumps[next_offset] = target
                self.except_targets[target] = next_offset

    elif op == self.opc.SETUP_FINALLY:
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op in self.jump_if_pop:
        target = self.get_target(offset)
        if target > offset:
            unop_target = self.last_instr(offset, target,
                                          self.opc.JUMP_FORWARD, target)
            if unop_target and code[unop_target + 3] != self.opc.ROT_TWO:
                self.fixed_jumps[offset] = unop_target
            else:
                self.fixed_jumps[offset] = self.restrict_to_parent(
                    target, parent)
                pass
            pass
    else:
        # 3.5+ has Jump optimization which too often causes RETURN_VALUE to get
        # misclassified as RETURN_END_IF. Handle that here.
        # In RETURN_VALUE, JUMP_ABSOLUTE, RETURN_VALUE is never RETURN_END_IF
        if op == self.opc.RETURN_VALUE:
            next_offset = xdis.next_offset(op, self.opc, offset)

            if next_offset < len(code) and (
                    code[next_offset] == self.opc.JUMP_ABSOLUTE
                    and offset in self.return_end_ifs):
                self.return_end_ifs.remove(offset)
                pass
            pass
        elif op == self.opc.JUMP_FORWARD:
            # If we have:
            #   JUMP_FORWARD x, [non-jump, insns], RETURN_VALUE, x:
            # then RETURN_VALUE is not RETURN_END_IF
            rtarget = self.get_target(offset)
            rtarget_prev = self.prev[rtarget]
            if (code[rtarget_prev] == self.opc.RETURN_VALUE
                    and rtarget_prev in self.return_end_ifs):
                # Walk backwards from the RETURN_VALUE to this jump; bail out
                # if any other jump lies in between.
                i = rtarget_prev
                while i != offset:
                    if code[i] in [op3.JUMP_FORWARD, op3.JUMP_ABSOLUTE]:
                        return
                    i = self.prev[i]
                self.return_end_ifs.remove(rtarget_prev)
            pass
    return
def find_jump_targets(self, debug):
    """
    Walk ``self.insts`` and gather every offset that some instruction
    jumps to, i.e. the places where a COME_FROM pseudo-instruction might
    be inserted.

    The control-flow bookkeeping on ``self`` (``structs``, ``loops``,
    ``fixed_jumps``, ``except_targets``, ``ignore_if``, ``else_start``,
    ``not_continue``, ``return_end_ifs``, ``setup_loop_targets``,
    ``setup_loops``) is reset first and then filled in by calling
    ``detect_control_flow()`` once per instruction.

    :param debug: when it is ``"both"`` or ``"after"``, ``self.structs``
                  is pretty-printed before returning.
    :return: dict mapping a target offset to the list of offsets that
             jump to it; one target may be reached from several jumps.
    """
    bytecode = self.code
    last_index = len(bytecode) - 1

    # The outermost structure spans the whole code object.
    self.structs = [{"type": "root", "start": 0, "end": last_index}]

    # All loop entry points
    self.loops = []

    # Map fixed jumps to their real destination
    self.fixed_jumps = {}
    self.except_targets = {}
    self.ignore_if = set()
    self.build_statement_indices()
    self.else_start = {}

    # Containers filled by detect_control_flow()
    self.not_continue = set()
    self.return_end_ifs = set()
    self.setup_loop_targets = {}  # target given setup_loop offset
    self.setup_loops = {}  # setup_loop offset given target

    targets = {}
    for inst_index, inst in enumerate(self.insts):
        offset = inst.offset
        op = inst.opcode

        # Determine structures and fix jumps in Python versions
        # since 2.3
        self.detect_control_flow(offset, targets, inst_index)

        if inst.has_arg:
            label = self.fixed_jumps.get(offset)
            oparg = inst.arg

            # When the raw byte at this offset is an EXTENDED_ARG prefix,
            # step over it before computing the fall-through offset.
            base = offset
            if self.code[offset] == self.opc.EXTENDED_ARG:
                base = xdis.next_offset(op, self.opc, offset)
            next_offset = xdis.next_offset(op, self.opc, base)

            if label is None:
                if op in self.opc.hasjrel and op != self.opc.FOR_ITER:
                    label = next_offset + oparg
                elif (op in self.opc.hasjabs and op in self.jump_if_pop
                      and oparg > offset):
                    label = oparg

            if label is not None and label != -1:
                targets.setdefault(label, []).append(offset)
        elif op == self.opc.END_FINALLY and offset in self.fixed_jumps:
            label = self.fixed_jumps[offset]
            targets.setdefault(label, []).append(offset)

    # DEBUG:
    if debug in ("both", "after"):
        import pprint as pp
        pp.pprint(self.structs)

    return targets
def basic_blocks(version, is_pypy, fn, first_line=None):
    """Create a list of basic blocks found in a code object

    The instructions of ``fn`` are scanned twice: a first pass collects every
    jump-target offset, a second pass cuts the instruction stream into basic
    blocks and tags each block with BB_* flags.

    :param version: passed through to ``BBMgr``.
    :param is_pypy: passed through to ``BBMgr``.
    :param fn: object handed to ``get_instructions()`` to obtain instructions.
    :param first_line: forwarded to ``get_instructions()`` as ``first_line``.
    :return: the ``BBMgr`` instance holding the blocks in ``bb_list``,
             including a trailing artificial BB_EXIT block.
    """

    BB = BBMgr(version, is_pypy)

    # Get jump targets
    jump_targets = set()
    instructions = list(get_instructions(fn, first_line=first_line))
    for inst in instructions:
        op = inst.opcode
        offset = inst.offset
        follow_offset = next_offset(op, BB.opcode, offset)
        if op in BB.JUMP_INSTRUCTIONS:
            # Absolute jumps carry the target directly; relative ones are
            # measured from the following instruction.
            if op in BB.JABS_INSTRUCTIONS:
                jump_offset = inst.arg
            else:
                jump_offset = follow_offset + inst.arg
            jump_targets.add(jump_offset)
            pass

    # State of the block currently being accumulated.
    start_offset = 0
    end_offset = -1
    jump_offsets = set()
    prev_offset = -1
    endloop_offsets = [-1]
    flags = set([BB_ENTRY])
    end_try_offset_stack = []
    try_stack = []
    # NOTE(review): end_try_offset starts as None and is only reassigned
    # inside the ``offset == end_try_offset`` test below, so that test looks
    # like it can never succeed -- confirm.
    end_try_offset = None
    loop_offset = None

    for i, inst in enumerate(instructions):
        prev_offset = end_offset
        end_offset = inst.offset
        op = inst.opcode
        offset = inst.offset
        follow_offset = next_offset(op, BB.opcode, offset)

        if offset == end_try_offset:
            if len(end_try_offset_stack):
                end_try_offset = end_try_offset_stack[-1]
                end_try_offset_stack.pop()
            else:
                end_try_offset = None

        # Track the end offsets of enclosing loops as a stack.
        if op in BB.LOOP_INSTRUCTIONS:
            jump_offset = follow_offset + inst.arg
            endloop_offsets.append(jump_offset)
            loop_offset = offset
        elif offset == endloop_offsets[-1]:
            endloop_offsets.pop()
        pass

        if op in BB.LOOP_INSTRUCTIONS:
            flags.add(BB_LOOP)
        elif op in BB.BREAK_INSTRUCTIONS:
            # A break ends the block and jumps to the innermost loop's end.
            flags.add(BB_BREAK)
            jump_offsets.add(endloop_offsets[-1])
            block, flags, jump_offsets = BB.add_bb(start_offset, end_offset,
                                                   loop_offset, follow_offset,
                                                   flags, jump_offsets)
            loop_offset = None
            if BB_TRY in block.flags:
                try_stack.append(block)
            start_offset = follow_offset

        if offset in jump_targets:
            # Fallthrough path and jump target path.
            # This instruction definitely starts a new basic block
            # Close off any prior basic block
            if start_offset < end_offset:
                block, flags, jump_offsets = BB.add_bb(start_offset,
                                                       prev_offset,
                                                       loop_offset,
                                                       end_offset, flags,
                                                       jump_offsets)
                loop_offset = None
                if BB_TRY in block.flags:
                    try_stack.append(block)
                    pass

                start_offset = end_offset
                pass

        # Add block flags for certain classes of instructions
        if op in BB.JUMP_CONDITONAL:
            flags.add(BB_JUMP_CONDITIONAL)

        if op in BB.POP_BLOCK_INSTRUCTIONS:
            flags.add(BB_POP_BLOCK)
            if start_offset == offset:
                flags.add(BB_STARTS_POP_BLOCK)
                flags.remove(BB_POP_BLOCK)
        elif op in BB.EXCEPT_INSTRUCTIONS:
            # NOTE(review): this tests the *running* interpreter's version
            # (sys.version_info), not the ``version`` argument -- confirm.
            if (sys.version_info[0:2] <= (2, 7)):
                # In Python up to 2.7, three 'POP_TOP'S at the beginning of a block
                # indicate an exception handler. We also check
                # that we are nested inside a "try".
                if len(try_stack) == 0 or start_offset != offset:
                    continue
                pass
                if (instructions[i+1].opcode != BB.opcode.opmap['POP_TOP'] or
                        instructions[i+2].opcode != BB.opcode.opmap['POP_TOP']):
                    continue
            flags.add(BB_EXCEPT)
            try_stack[-1].exception_offsets.add(start_offset)
            pass
        elif op in BB.TRY_INSTRUCTIONS:
            end_try_offset_stack.append(inst.argval)
            flags.add(BB_TRY)
        elif op in BB.END_FINALLY_INSTRUCTIONS:
            flags.add(BB_END_FINALLY)
            try_stack[-1].exception_offsets.add(start_offset)
        elif op in BB.FOR_INSTRUCTIONS:
            flags.add(BB_FOR)
            jump_offsets.add(inst.argval)
            block, flags, jump_offsets = BB.add_bb(start_offset, end_offset,
                                                   loop_offset, follow_offset,
                                                   flags, jump_offsets)
            loop_offset = None
            start_offset = follow_offset
        elif op in BB.JUMP_INSTRUCTIONS:
            # Some sort of jump instruction.
            # Figure out where we jump to and add it to this
            # basic block's jump offsets.
            if op in BB.JABS_INSTRUCTIONS:
                jump_offset = inst.arg
            else:
                jump_offset = inst.argval

            jump_offsets.add(jump_offset)
            if op in BB.JUMP_UNCONDITONAL:
                flags.add(BB_JUMP_UNCONDITIONAL)
                if jump_offset == follow_offset:
                    flags.add(BB_JUMP_TO_FALLTHROUGH)
                    pass
                block, flags, jump_offsets = BB.add_bb(start_offset,
                                                       end_offset,
                                                       loop_offset,
                                                       follow_offset, flags,
                                                       jump_offsets)
                loop_offset = None
                if BB_TRY in block.flags:
                    try_stack.append(block)
                    pass

                start_offset = follow_offset
            elif op != BB.opcode.SETUP_LOOP:
                if op in BB.FINALLY_INSTRUCTIONS:
                    flags.add(BB_FINALLY)

                block, flags, jump_offsets = BB.add_bb(start_offset,
                                                       end_offset,
                                                       loop_offset,
                                                       follow_offset, flags,
                                                       jump_offsets)
                loop_offset = None
                if BB_TRY in block.flags:
                    try_stack.append(block)
                start_offset = follow_offset

                pass
        elif op in BB.NOFOLLOW_INSTRUCTIONS:
            flags.add(BB_NOFOLLOW)
            last_block, flags, jump_offsets = BB.add_bb(start_offset,
                                                        end_offset,
                                                        loop_offset,
                                                        follow_offset, flags,
                                                        jump_offsets)
            loop_offset = None
            start_offset = follow_offset
            pass
        pass

    # The last block recorded so far has no fall-through successor.
    if len(BB.bb_list):
        BB.bb_list[-1].follow_offset = None
        BB.start_block = BB.bb_list[0]

    # Add remaining instructions?
    if start_offset <= end_offset:
        BB.bb_list.append(BasicBlock(start_offset, end_offset, loop_offset,
                                     None, flags=flags,
                                     jump_offsets=jump_offsets))
        loop_offset = None
        pass

    # Add an artificial block where we can link the exits of other blocks
    # to. This helps in computing reverse dominators.
    BB.add_bb(end_offset+1, end_offset+1, None, None, set([BB_EXIT]), [])
    return BB
def reduce_is_invalid(self, rule, ast, tokens, first, last):
    """
    Decide whether reducing ``rule`` over ``tokens[first:last]`` must be
    rejected.

    The parent class is asked first; if ``self.reduce_check_table`` has an
    entry for the rule's left-hand side, that function's answer replaces
    the parent's.  A truthy answer is returned as is.  Otherwise a set of
    rule-specific checks below compares jump targets with token offsets.

    :param rule: grammar rule as ``(lhs, rhs_tuple)``.
    :param ast: parse-tree node built for the candidate reduction.
    :param tokens: the full token list.
    :param first: index of the first token covered by the reduction.
    :param last: index just past the covered tokens; clamped to the last
                 valid index before the rule-specific checks run.
    :return: a truthy value when the reduction is invalid, falsy otherwise.
    """
    verdict = super(Python27Parser, self).reduce_is_invalid(
        rule, ast, tokens, first, last)

    lhs = rule[0]
    n_tokens = len(tokens)

    # A table-driven check, when present, overrides the parent's verdict.
    checker = self.reduce_check_table.get(lhs, None)
    if checker:
        verdict = checker(self, lhs, n_tokens, rule, ast, tokens, first, last)

    last = min(last, n_tokens-1)
    if verdict:
        return verdict

    if rule == ("comp_if", ("expr", "jmp_false", "comp_iter")):
        cond_jump = ast[1][0]
        if cond_jump == "POP_JUMP_IF_FALSE":
            # Invalid when the jump lands strictly inside the construct.
            return tokens[first].offset < cond_jump.attr < tokens[last].offset
        return False

    if lhs == "if_exp" and rule[1][0:5] == (
            "expr", "jmp_false", "expr", "JUMP_ABSOLUTE", "expr"):
        cond_jump = ast[1][0]
        if cond_jump == "POP_JUMP_IF_FALSE":
            # The false-jump has to land on the first "else" instruction.
            else_start = ast[4].first_child()
            if cond_jump.attr != else_start.offset:
                return True
            return ast[3].attr < tokens[last].offset
        return False

    if lhs == "raise_stmt1":
        return ast[0] == "expr" and ast[0][0] == "or"

    if lhs in ("assert", "assert2"):
        jump_target = ast[1][0].attr
        if last >= n_tokens or jump_target == tokens[last].offset:
            return False
        tail = ast[-1]
        return not (jump_target == next_offset(tail.op, tail.opc, tail.offset))

    if rule == ("ifstmt", ("testexpr", "_ifstmts_jump")):
        # Look back over at most three tokens for a JUMP_FORWARD, skipping
        # only POP_TOP and COME_FROM.
        for idx in (last-1, last-2, last-3):
            tok = tokens[idx]
            if tok == "JUMP_FORWARD":
                return tok.attr > tokens[min(last, n_tokens-1)].off2int()
            elif tok not in ("POP_TOP", "COME_FROM"):
                break
        return False

    if rule == ("iflaststmtl", ("testexpr", "c_stmts")):
        test = ast[0][0]
        if test not in ("testfalse", "testtrue"):
            return False
        if not (len(test) > 1 and test[1].kind.startswith("jmp_")):
            return False
        jmp_target = test[1][0].attr
        if last == n_tokens:
            last -= 1
        # Tokens whose offset is a string are skipped at both ends so that
        # off2int() is applied to real instructions.
        while (isinstance(tokens[first].offset, str) and first < last):
            first += 1
        if first == last:
            return True
        while (first < last and isinstance(tokens[last].offset, str)):
            last -= 1
        return tokens[first].off2int() < jmp_target < tokens[last].off2int()

    if rule in (("list_if_not", ("expr", "jmp_true", "list_iter")),
                ("list_if", ("expr", "jmp_false", "list_iter"))):
        # Invalid when the jump goes forward but stops short of the end.
        jump_inst = ast[1][0]
        destination = jump_inst.attr
        return destination > jump_inst.offset and destination < tokens[last].offset

    if rule == ("or", ("expr", "jmp_true", "expr", "\\e_come_from_opt")):
        # The true-jump must not land in the middle of the "or": it has to
        # reach the token at ``last`` or share that token's pattr.
        jmp_true = ast[1][0]
        jmp_target = jmp_true.offset + jmp_true.attr + 3
        if jmp_target == tokens[last].offset:
            return False
        return not (tokens[last].pattr == jmp_true.pattr)

    if lhs == "whilestmt" and rule[1][0:-2] == (
            "SETUP_LOOP", "testexpr", "l_stmts_opt", "JUMP_BACK", "JUMP_BACK"):
        # Make sure that the jump backs all go to the same place
        i = last-1
        while tokens[i] != "JUMP_BACK":
            i -= 1
        return tokens[i].attr != tokens[i-1].attr

    if lhs == "if_exp_true":
        return (first) > 0 and tokens[first-1] == "POP_JUMP_IF_FALSE"

    return False
def detect_control_flow(self, offset, targets, inst_index):
    """
    Detect type of block structures and their boundaries to fix optimized jumps
    in python2.3+

    Examines the instruction ``self.insts[inst_index]`` and, depending on
    its opcode, records what it finds in the scanner's bookkeeping
    containers: ``self.structs`` (dicts with 'type', 'start' and 'end'
    keys), ``self.fixed_jumps``, ``self.loops``, ``self.setup_loops``,
    ``self.ignore_if``, ``self.not_continue``, ``self.else_start``,
    ``self.return_end_ifs`` and ``self.except_targets``.

    :param offset: byte offset of the instruction being examined; used
        as a key/index into the per-offset tables (``self.lines``,
        ``self.next_stmt``, ``self.prev_op``, ...).
    :param targets: container keyed by offset; ``targets[next_op]`` is
        iterated and each element is used as an index into ``self.code``.
        NOTE(review): presumably the offsets of instructions that jump to
        the key; confirm against the caller.
    :param inst_index: index of the instruction in ``self.insts``.
    :return: always ``None``; all results are side effects on ``self``.
    """
    code = self.code
    inst = self.insts[inst_index]
    op = inst.opcode

    # Detect parent structure
    parent = self.structs[0]
    start = parent['start']
    end = parent['end']

    # Pick inner-most parent for our offset
    for struct in self.structs:
        current_start = struct['start']
        current_end = struct['end']
        if ((current_start <= offset < current_end) and
                (current_start >= start and current_end <= end)):
            start = current_start
            end = current_end
            parent = struct

    if self.version < 3.8 and op == self.opc.SETUP_LOOP:
        # We categorize loop types: 'for', 'while', 'while 1' with
        # possibly suffixes '-loop' and '-else'
        # Try to find the jump_back instruction of the loop.
        # It could be a return instruction.
        start += inst.inst_size
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.setup_loops[target] = offset

        if target != end:
            self.fixed_jumps[offset] = end

        (line_no, next_line_byte) = self.lines[offset]
        jump_back = self.last_instr(start, end, self.opc.JUMP_ABSOLUTE,
                                    next_line_byte, False)

        if jump_back:
            jump_forward_offset = xdis.next_offset(code[jump_back], self.opc, jump_back)
        else:
            jump_forward_offset = None

        return_val_offset1 = self.prev[self.prev[end]]

        # Discard the JUMP_ABSOLUTE candidate when the region before
        # "end" finishes with RETURN_VALUE (optionally followed by
        # POP_BLOCK); the RETURN_VALUE search below is used instead.
        if (jump_back and jump_back != self.prev_op[end] and
                self.is_jump_forward(jump_forward_offset)):
            if (code[self.prev_op[end]] == self.opc.RETURN_VALUE or
                    (code[self.prev_op[end]] == self.opc.POP_BLOCK and
                     code[return_val_offset1] == self.opc.RETURN_VALUE)):
                jump_back = None
        if not jump_back:
            # loop suite ends in return
            jump_back = self.last_instr(start, end, self.opc.RETURN_VALUE)
            if not jump_back:
                return

            jb_inst = self.get_inst(jump_back)
            jump_back = self.next_offset(jb_inst.opcode, jump_back)

            if_offset = None
            if code[self.prev_op[next_line_byte]] not in self.pop_jump_tf:
                if_offset = self.prev[next_line_byte]
            if if_offset:
                loop_type = 'while'
                self.ignore_if.add(if_offset)
            else:
                loop_type = 'for'
            target = next_line_byte
            end = xdis.next_offset(code[jump_back], self.opc, jump_back)
        else:
            if self.get_target(jump_back) >= next_line_byte:
                jump_back = self.last_instr(start, end, self.opc.JUMP_ABSOLUTE, start, False)

            jb_inst = self.get_inst(jump_back)

            jb_next_offset = self.next_offset(jb_inst.opcode, jump_back)
            if end > jb_next_offset and self.is_jump_forward(end):
                if self.is_jump_forward(jb_next_offset):
                    if self.get_target(jb_next_offset) == self.get_target(end):
                        self.fixed_jumps[offset] = jb_next_offset
                        end = jb_next_offset
            elif target < offset:
                self.fixed_jumps[offset] = jb_next_offset
                end = jb_next_offset

            target = self.get_target(jump_back)

            if code[target] in (self.opc.FOR_ITER, self.opc.GET_ITER):
                loop_type = 'for'
            else:
                loop_type = 'while'
                test = self.prev_op[next_line_byte]

                if test == offset:
                    loop_type = 'while 1'
                elif self.code[test] in self.opc.JUMP_OPs:
                    self.ignore_if.add(test)
                    test_target = self.get_target(test)
                    if test_target > (jump_back+3):
                        jump_back = test_target
            self.not_continue.add(jump_back)
        self.loops.append(target)
        self.structs.append({'type': loop_type + '-loop',
                             'start': target,
                             'end': jump_back})
        after_jump_offset = xdis.next_offset(code[jump_back], self.opc, jump_back)
        # Anything between the end of the loop body and "end" is
        # recorded as the loop's else-part.
        if after_jump_offset != end:
            self.structs.append({'type': loop_type + '-else',
                                 'start': after_jump_offset,
                                 'end': end})
    elif op in self.pop_jump_tf:
        start = offset + inst.inst_size
        target = inst.argval
        rtarget = self.restrict_to_parent(target, parent)
        prev_op = self.prev_op

        # Do not let jump to go out of parent struct bounds
        if target != rtarget and parent['type'] == 'and/or':
            self.fixed_jumps[offset] = rtarget
            return

        # Does this jump to right after another conditional jump that is
        # not myself?  If so, it's part of a larger conditional.
        # rocky: if we have a conditional jump to the next instruction, then
        # possibly I am "skipping over" a "pass" or null statement.
        pretarget = self.get_inst(prev_op[target])

        if (pretarget.opcode in self.pop_jump_if_pop and
                (target > offset) and pretarget.offset != offset):

            # FIXME: hack upon hack...
            # In some cases the pretarget can be a jump to the next instruction
            # and these aren't and/or's either. We limit to 3.5+ since we experienced there
            # but it might be earlier versions, or might be a general principle.
            if self.version < 3.5 or pretarget.argval != target:
                # FIXME: this is not accurate The commented out below
                # is what it should be. However grammar rules right now
                # assume the incorrect offsets.
                # self.fixed_jumps[offset] = target
                self.fixed_jumps[offset] = pretarget.offset
                self.structs.append({'type': 'and/or',
                                     'start': start,
                                     'end': pretarget.offset})
                return

        # The opcode *two* instructions before the target jump offset is important
        # in making a determination of what we have. Save that.
        pre_rtarget = prev_op[rtarget]

        # Is it an "and" inside an "if" or "while" block
        if op == self.opc.POP_JUMP_IF_FALSE:

            # Search for another POP_JUMP_IF_FALSE targetting the same op,
            # in current statement, starting from current offset, and filter
            # everything inside inner 'or' jumps and midline ifs
            match = self.rem_or(start, self.next_stmt[offset],
                                self.opc.POP_JUMP_IF_FALSE, target)

            # If we still have any offsets in set, start working on it
            if match:
                is_jump_forward = self.is_jump_forward(pre_rtarget)
                if (is_jump_forward and pre_rtarget not in self.stmts and
                        self.restrict_to_parent(self.get_target(pre_rtarget), parent) == rtarget):
                    # The first two arms deliberately do nothing: when
                    # either condition holds, control falls through to
                    # the generic handling further below.
                    if (code[prev_op[pre_rtarget]] == self.opc.JUMP_ABSOLUTE and
                            self.remove_mid_line_ifs([offset]) and
                            target == self.get_target(prev_op[pre_rtarget]) and
                            (prev_op[pre_rtarget] not in self.stmts or
                             self.get_target(prev_op[pre_rtarget]) > prev_op[pre_rtarget]) and
                            1 == len(self.remove_mid_line_ifs(self.rem_or(start, prev_op[pre_rtarget], self.pop_jump_tf, target)))):
                        pass
                    elif (code[prev_op[pre_rtarget]] == self.opc.RETURN_VALUE and
                          self.remove_mid_line_ifs([offset]) and
                          1 == (len(set(self.remove_mid_line_ifs(self.rem_or(start, prev_op[pre_rtarget], self.pop_jump_tf, target))) |
                                    set(self.remove_mid_line_ifs(self.rem_or(start, prev_op[pre_rtarget], (self.opc.POP_JUMP_IF_FALSE, self.opc.POP_JUMP_IF_TRUE, self.opc.JUMP_ABSOLUTE), pre_rtarget, True)))))):
                        pass
                    else:
                        fix = None
                        jump_ifs = self.inst_matches(start, self.next_stmt[offset],
                                                     self.opc.POP_JUMP_IF_FALSE)
                        last_jump_good = True
                        for j in jump_ifs:
                            if target == self.get_target(j):
                                # FIXME: remove magic number
                                if self.lines[j].next == j + 3 and last_jump_good:
                                    fix = j
                                    break
                            else:
                                last_jump_good = False
                        self.fixed_jumps[offset] = fix or match[-1]
                        return
                else:
                    if self.version < 3.6:
                        # FIXME: this is putting in COME_FROMs in the wrong place.
                        # Fix up grammar so we don't need to do this.
                        # See cf_for_iter use in parser36.py
                        self.fixed_jumps[offset] = match[-1]
                    elif target > offset:
                        # Right now we only add COME_FROMs in forward (not loop) jumps
                        self.fixed_jumps[offset] = target
                    return
        # op == POP_JUMP_IF_TRUE
        else:
            next = self.next_stmt[offset]
            if prev_op[next] == offset:
                pass
            elif self.is_jump_forward(next) and target == self.get_target(next):
                if code[prev_op[next]] == self.opc.POP_JUMP_IF_FALSE:
                    if (code[next] == self.opc.JUMP_FORWARD or
                            target != rtarget or
                            code[prev_op[pre_rtarget]] not in
                            (self.opc.JUMP_ABSOLUTE, self.opc.RETURN_VALUE)):
                        self.fixed_jumps[offset] = prev_op[next]
                        return
            elif (code[next] == self.opc.JUMP_ABSOLUTE and self.is_jump_forward(target) and
                  self.get_target(target) == self.get_target(next)):
                self.fixed_jumps[offset] = prev_op[next]
                return

        # Don't add a struct for a while test, it's already taken care of
        if offset in self.ignore_if:
            return

        rtarget_is_ja = code[pre_rtarget] == self.opc.JUMP_ABSOLUTE
        if ( rtarget_is_ja and
             pre_rtarget in self.stmts and
             pre_rtarget != offset and
             prev_op[pre_rtarget] != offset and
             not (code[rtarget] == self.opc.JUMP_ABSOLUTE and
                  code[rtarget+3] == self.opc.POP_BLOCK and
                  code[prev_op[pre_rtarget]] != self.opc.JUMP_ABSOLUTE)):
            rtarget = pre_rtarget

        # Does the "jump if" jump beyond a jump op?
        # That is, we have something like:
        #  POP_JUMP_IF_FALSE HERE
        #  ...
        # JUMP_FORWARD
        # HERE:
        #
        # If so, this can be block inside an "if" statement
        # or a conditional assignment like:
        #   x = 1 if x else 2
        #
        # For 3.5, in addition the JUMP_FORWARD above we could have
        # JUMP_BACK or CONTINUE
        #
        # There are other situations we may need to consider, like
        # if the condition jump is to a forward location.
        # Also the existence of a jump to the instruction after "END_FINALLY"
        # will distinguish "try/else" from "try".
        if self.version < 3.8:
            rtarget_break = (self.opc.RETURN_VALUE, self.opc.BREAK_LOOP)
        else:
            rtarget_break = (self.opc.RETURN_VALUE,)

        if self.is_jump_forward(pre_rtarget) or (rtarget_is_ja and self.version >= 3.5):
            if_end = self.get_target(pre_rtarget)

            # If the jump target is back, we are looping
            if (if_end < pre_rtarget and self.version < 3.8 and
                    (code[prev_op[if_end]] == self.opc.SETUP_LOOP)):
                if (if_end > start):
                    return

            end = self.restrict_to_parent(if_end, parent)

            self.structs.append({'type': 'if-then',
                                 'start': start,
                                 'end': pre_rtarget})

            # FIXME: add this
            # self.fixed_jumps[offset] = rtarget
            self.not_continue.add(pre_rtarget)

            if rtarget < end and (
                    code[rtarget] not in (self.opc.END_FINALLY,
                                          self.opc.JUMP_ABSOLUTE) and
                    code[prev_op[pre_rtarget]] not in (self.opc.POP_EXCEPT,
                                                       self.opc.END_FINALLY)):
                self.structs.append({'type': 'else',
                                     'start': rtarget,
                                     'end': end})
                self.else_start[rtarget] = end
        elif self.is_jump_back(pre_rtarget, 0):
            if_end = rtarget
            self.structs.append({'type': 'if-then',
                                 'start': start,
                                 'end': pre_rtarget})
            self.not_continue.add(pre_rtarget)
        elif code[pre_rtarget] in rtarget_break:
            self.structs.append({'type': 'if-then',
                                 'start': start,
                                 'end': rtarget})
            # It is important to distingish if this return is inside some sort
            # except block return
            jump_prev = prev_op[offset]
            if self.is_pypy and code[jump_prev] == self.opc.COMPARE_OP:
                if self.opc.cmp_op[code[jump_prev+1]] == 'exception-match':
                    return
            if self.version >= 3.5:
                # Python 3.5 may remove as dead code a JUMP
                # instruction after a RETURN_VALUE. So we check
                # based on seeing SETUP_EXCEPT various places.
                if self.version < 3.6 and code[rtarget] == self.opc.SETUP_EXCEPT:
                    return
                # Check that next instruction after pops and jump is
                # not from SETUP_EXCEPT
                next_op = rtarget
                if code[next_op] == self.opc.POP_BLOCK:
                    next_op += instruction_size(self.code[next_op], self.opc)
                if code[next_op] == self.opc.JUMP_ABSOLUTE:
                    next_op += instruction_size(self.code[next_op], self.opc)
                if next_op in targets:
                    for try_op in targets[next_op]:
                        come_from_op = code[try_op]
                        if self.version < 3.8 and come_from_op == self.opc.SETUP_EXCEPT:
                            return
                        pass
                pass

            if self.version >= 3.4:
                self.fixed_jumps[offset] = rtarget

            if code[pre_rtarget] == self.opc.RETURN_VALUE:
                # If we are at some sort of POP_JUMP_IF and the instruction before was
                # COMPARE_OP exception-match, then pre_rtarget is not an end_if
                if not (inst_index > 0 and self.insts[inst_index-1].argval == 'exception-match'):
                    self.return_end_ifs.add(pre_rtarget)
            else:
                self.fixed_jumps[offset] = rtarget
                self.not_continue.add(pre_rtarget)
        else:

            # FIXME: this is very convoluted and based on rather hacky
            # empirical evidence. It should go a way when
            # we have better control-flow analysis
            normal_jump = self.version >= 3.6
            if self.version == 3.5:
                j = self.offset2inst_index[target]
                if j+2 < len(self.insts) and self.insts[j+2].is_jump_target:
                    normal_jump = self.insts[j+1].opname == 'POP_BLOCK'

            if normal_jump:
                # For now, we'll only tag forward jump.
                if target > offset:
                    self.fixed_jumps[offset] = target
                    pass
            else:
                # FIXME: This is probably a bug in < 3.5 and we should
                # instead use the above code. But until we smoke things
                # out we'll stick with it.
                if rtarget > offset:
                    self.fixed_jumps[offset] = rtarget

    elif self.version < 3.8 and op == self.opc.SETUP_EXCEPT:
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op == self.opc.POP_EXCEPT:
        # Look at the instruction that follows POP_EXCEPT.
        next_offset = xdis.next_offset(op, self.opc, offset)
        target = self.get_target(next_offset)
        if target > next_offset:
            next_op = code[next_offset]
            if (self.opc.JUMP_ABSOLUTE == next_op and
                    self.opc.END_FINALLY != code[xdis.next_offset(next_op, self.opc, next_offset)]):
                self.fixed_jumps[next_offset] = target
                self.except_targets[target] = next_offset

    elif op == self.opc.SETUP_FINALLY:
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op in self.jump_if_pop:
        target = self.get_target(offset)
        if target > offset:
            unop_target = self.last_instr(offset, target, self.opc.JUMP_FORWARD, target)
            if unop_target and code[unop_target+3] != self.opc.ROT_TWO:
                self.fixed_jumps[offset] = unop_target
            else:
                self.fixed_jumps[offset] = self.restrict_to_parent(target, parent)
                pass
            pass
    elif self.version >= 3.5:
        # 3.5+ has Jump optimization which too often causes RETURN_VALUE to get
        # misclassified as RETURN_END_IF. Handle that here.
        # In RETURN_VALUE, JUMP_ABSOLUTE, RETURN_VALUE is never RETURN_END_IF
        if op == self.opc.RETURN_VALUE:
            next_offset = xdis.next_offset(op, self.opc, offset)
            if ( next_offset < len(code) and
                 (code[next_offset] == self.opc.JUMP_ABSOLUTE and
                  offset in self.return_end_ifs) ):
                self.return_end_ifs.remove(offset)
                pass
            pass
        elif op == self.opc.JUMP_FORWARD:
            # If we have:
            #   JUMP_FORWARD x, [non-jump, insns], RETURN_VALUE, x:
            # then RETURN_VALUE is not RETURN_END_IF
            rtarget = self.get_target(offset)
            rtarget_prev = self.prev[rtarget]
            if (code[rtarget_prev] == self.opc.RETURN_VALUE and
                    rtarget_prev in self.return_end_ifs):
                # Walk backwards from the RETURN_VALUE to this jump;
                # give up if another jump lies in between.
                i = rtarget_prev
                while i != offset:
                    if code[i] in [op3.JUMP_FORWARD, op3.JUMP_ABSOLUTE]:
                        return
                    i = self.prev[i]
                self.return_end_ifs.remove(rtarget_prev)
            pass
    return
def detect_control_flow(
    self, offset: int, targets: Dict[Any, Any], inst_index: int
):
    """
    Detect type of block structures and their boundaries to fix optimized jumps
    in python2.3+

    Examines the instruction ``self.insts[inst_index]`` and, depending on
    its opcode, records what it finds in ``self.structs`` (dicts with
    "type", "start" and "end" keys), ``self.fixed_jumps``,
    ``self.loops``, ``self.setup_loops``, ``self.ignore_if``,
    ``self.not_continue``, ``self.except_targets`` and
    ``self.return_end_ifs``.

    :param offset: byte offset of the instruction being examined; used
        as a key/index into the per-offset tables.
    :param targets: not read anywhere in this variant of the method.
    :param inst_index: index of the instruction in ``self.insts``.
    :return: always ``None``; all results are side effects on ``self``.
    """
    code = self.code
    inst = self.insts[inst_index]
    op = inst.opcode

    # Detect parent structure
    parent: Dict[str, Any] = self.structs[0]
    start: int = parent["start"]
    end: int = parent["end"]

    # Pick inner-most parent for our offset
    for struct in self.structs:
        current_start = struct["start"]
        current_end = struct["end"]
        if (current_start <= offset < current_end) and (
            current_start >= start and current_end <= end
        ):
            start = current_start
            end = current_end
            parent = struct

    if self.version < 3.8 and op == self.opc.SETUP_LOOP:
        # We categorize loop types: 'for', 'while', 'while 1' with
        # possibly suffixes '-loop' and '-else'
        # Try to find the jump_back instruction of the loop.
        # It could be a return instruction.

        start += inst.inst_size
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.setup_loops[target] = offset

        if target != end:
            self.fixed_jumps[offset] = end

        (line_no, next_line_byte) = self.lines[offset]
        jump_back = self.last_instr(
            start, end, self.opc.JUMP_ABSOLUTE, next_line_byte, False
        )

        if jump_back:
            jump_forward_offset = xdis.next_offset(
                code[jump_back], self.opc, jump_back
            )
        else:
            jump_forward_offset = None

        return_val_offset1 = self.prev[self.prev[end]]

        # Discard the JUMP_ABSOLUTE candidate when the region before
        # "end" finishes with RETURN_VALUE (optionally followed by
        # POP_BLOCK); the RETURN_VALUE search below is used instead.
        if (
            jump_back
            and jump_back != self.prev_op[end]
            and self.is_jump_forward(jump_forward_offset)
        ):
            if code[self.prev_op[end]] == self.opc.RETURN_VALUE or (
                code[self.prev_op[end]] == self.opc.POP_BLOCK
                and code[return_val_offset1] == self.opc.RETURN_VALUE
            ):
                jump_back = None
        if not jump_back:
            # loop suite ends in return
            jump_back = self.last_instr(start, end, self.opc.RETURN_VALUE)
            if not jump_back:
                return

            jb_inst = self.get_inst(jump_back)
            jump_back = self.next_offset(jb_inst.opcode, jump_back)

            if_offset = None
            if code[self.prev_op[next_line_byte]] not in self.pop_jump_tf:
                if_offset = self.prev[next_line_byte]
            if if_offset:
                loop_type = "while"
                self.ignore_if.add(if_offset)
            else:
                loop_type = "for"
            target = next_line_byte
            end = xdis.next_offset(code[jump_back], self.opc, jump_back)
        else:
            if self.get_target(jump_back) >= next_line_byte:
                jump_back = self.last_instr(
                    start, end, self.opc.JUMP_ABSOLUTE, start, False
                )

            jb_inst = self.get_inst(jump_back)

            jb_next_offset = self.next_offset(jb_inst.opcode, jump_back)
            if end > jb_next_offset and self.is_jump_forward(end):
                if self.is_jump_forward(jb_next_offset):
                    if self.get_target(jb_next_offset) == self.get_target(end):
                        self.fixed_jumps[offset] = jb_next_offset
                        end = jb_next_offset
            elif target < offset:
                self.fixed_jumps[offset] = jb_next_offset
                end = jb_next_offset

            target = self.get_target(jump_back)

            if code[target] in (self.opc.FOR_ITER, self.opc.GET_ITER):
                loop_type = "for"
            else:
                loop_type = "while"
                test = self.prev_op[next_line_byte]

                if test == offset:
                    loop_type = "while 1"
                elif self.code[test] in self.opc.JUMP_OPs:
                    self.ignore_if.add(test)
                    test_target = self.get_target(test)
                    if test_target > (jump_back + 3):
                        jump_back = test_target
            self.not_continue.add(jump_back)
        self.loops.append(target)
        self.structs.append(
            {"type": loop_type + "-loop", "start": target, "end": jump_back}
        )
        after_jump_offset = xdis.next_offset(code[jump_back], self.opc, jump_back)
        # Anything between the end of the loop body and "end" is
        # recorded as the loop's else-part.
        if after_jump_offset != end:
            self.structs.append(
                {
                    "type": loop_type + "-else",
                    "start": after_jump_offset,
                    "end": end,
                }
            )
    elif op in self.pop_jump_tf:
        # Conditional jumps are recorded with their target unmodified.
        target = inst.argval
        self.fixed_jumps[offset] = target

    # FIXME: consider removing the test on 3.8.
    elif self.version >= 3.8 and inst.is_jump():
        self.fixed_jumps[offset] = inst.argval

    elif self.version < 3.8 and op == self.opc.SETUP_EXCEPT:
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif self.version < 3.8 and op == self.opc.POP_EXCEPT:
        # Look at the instruction that follows POP_EXCEPT.
        next_offset = xdis.next_offset(op, self.opc, offset)
        target = self.get_target(next_offset)
        if target > next_offset:
            next_op = code[next_offset]
            if (
                self.opc.JUMP_ABSOLUTE == next_op
                and self.opc.END_FINALLY
                != code[xdis.next_offset(next_op, self.opc, next_offset)]
            ):
                self.fixed_jumps[next_offset] = target
                self.except_targets[target] = next_offset

    elif op == self.opc.SETUP_FINALLY:
        target = self.get_target(offset)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op in self.jump_if_pop:
        target = self.get_target(offset)
        if target > offset:
            unop_target = self.last_instr(
                offset, target, self.opc.JUMP_FORWARD, target
            )
            if unop_target and code[unop_target + 3] != self.opc.ROT_TWO:
                self.fixed_jumps[offset] = unop_target
            else:
                self.fixed_jumps[offset] = self.restrict_to_parent(target, parent)
                pass
            pass
    else:
        # 3.5+ has Jump optimization which too often causes RETURN_VALUE to get
        # misclassified as RETURN_END_IF. Handle that here.
        # In RETURN_VALUE, JUMP_ABSOLUTE, RETURN_VALUE is never RETURN_END_IF
        if op == self.opc.RETURN_VALUE:
            next_offset = xdis.next_offset(op, self.opc, offset)
            if next_offset < len(code) and (
                code[next_offset] == self.opc.JUMP_ABSOLUTE
                and offset in self.return_end_ifs
            ):
                self.return_end_ifs.remove(offset)
                pass
            pass
        elif op == self.opc.JUMP_FORWARD:
            # If we have:
            #   JUMP_FORWARD x, [non-jump, insns], RETURN_VALUE, x:
            # then RETURN_VALUE is not RETURN_END_IF
            rtarget = self.get_target(offset)
            rtarget_prev = self.prev[rtarget]
            if (
                code[rtarget_prev] == self.opc.RETURN_VALUE
                and rtarget_prev in self.return_end_ifs
            ):
                # Walk backwards from the RETURN_VALUE to this jump;
                # give up if another jump lies in between.
                i = rtarget_prev
                while i != offset:
                    if code[i] in [op3.JUMP_FORWARD, op3.JUMP_ABSOLUTE]:
                        return
                    i = self.prev[i]
                self.return_end_ifs.remove(rtarget_prev)
            pass
    return
def next_offset(self, op, offset):
    """Delegate to ``xdis.next_offset`` using this object's opcode module.

    :param op: opcode value, passed through as the first argument.
    :param offset: offset value, passed through as the last argument.
    :return: whatever ``xdis.next_offset(op, self.opc, offset)`` returns.
    """
    opcode_module = self.opc
    return xdis.next_offset(op, opcode_module, offset)
def parse_byte_and_args(self, byte_code, replay=False):
    """Decode the next instruction of the current frame.

    Advances ``frame.f_lasti`` (unless *replay* is set or the frame's
    ``fallthrough`` flag is off), updates ``frame.f_lineno`` when the
    offset starts a line, and decodes the operand, folding in any
    preceding ``EXTENDED_ARG`` instructions.

    :param byte_code: opcode of the previously executed instruction, or,
        when *replay* is true, the opcode to decode at the current offset.
    :param replay: when true, neither advance ``f_lasti`` nor re-read the
        opcode from the code object (used to handle breakpoints).
    :return: tuple ``(byte_name, byte_code, int_arg, arguments, offset,
        line_number)``; ``arguments`` is an empty list when the opcode
        takes no operand and a one-element list otherwise.
    """
    frame = self.frame
    code_obj = frame.f_code
    raw_code = code_obj.co_code
    extended_arg = 0

    # Note: There is never more than one argument.
    # The list size is used to indicate whether an argument
    # exists or not.
    # FIXME: remove and use int_arg as a indicator of whether
    # the argument exists.
    arguments = []
    int_arg = None

    while True:
        if not frame.fallthrough:
            # Jump instructions must set this False.
            frame.fallthrough = True
        elif not replay:
            frame.f_lasti = next_offset(byte_code, self.opc, frame.f_lasti)

        offset = frame.f_lasti
        line_number = frame.linestarts.get(offset, None)
        if line_number is not None:
            frame.f_lineno = line_number
        if not replay:
            byte_code = byteint(raw_code[offset])
        byte_name = self.opc.opname[byte_code]
        arg_offset = offset + 1
        arg = None

        if not op_has_argument(byte_code, self.opc):
            break

        if self.version >= 3.6:
            # Note: Python 3.6.0a1 is 2, for 3.6.a3 and beyond we have 1
            int_arg = code2num(raw_code, arg_offset) | extended_arg
            arg_offset += 1
            pending_extended = int_arg << 8
        else:
            low_byte = code2num(raw_code, arg_offset)
            high_byte = code2num(raw_code, arg_offset + 1)
            int_arg = low_byte + high_byte * 256 + extended_arg
            arg_offset += 2
            pending_extended = int_arg * 65536

        if byte_code == self.opc.EXTENDED_ARG:
            # Carry the operand bits over to the following instruction.
            extended_arg = pending_extended
            continue
        extended_arg = 0

        if byte_code in self.opc.CONST_OPS:
            arg = code_obj.co_consts[int_arg]
        elif byte_code in self.opc.FREE_OPS:
            cell_names = code_obj.co_cellvars
            if int_arg < len(cell_names):
                arg = cell_names[int_arg]
            else:
                arg = code_obj.co_freevars[int_arg - len(cell_names)]
        elif byte_code in self.opc.NAME_OPS:
            arg = code_obj.co_names[int_arg]
        elif byte_code in self.opc.JREL_OPS:
            # Many relative jumps are conditional,
            # so setting frame.fallthrough is wrong.
            arg = arg_offset + int_arg
        elif byte_code in self.opc.JABS_OPS:
            # We probably could set fallthrough, since many (all?)
            # of these are unconditional, but we'll make the jump do
            # the work of setting.
            arg = int_arg
        elif byte_code in self.opc.LOCAL_OPS:
            arg = code_obj.co_varnames[int_arg]
        else:
            arg = int_arg
        arguments = [arg]
        break

    return byte_name, byte_code, int_arg, arguments, offset, line_number
def detect_control_flow(self, offset, targets, extended_arg):
    """
    Detect structures and their boundaries to fix optimized jumps
    in python2.3+

    Examines the opcode at ``self.code[offset]`` and, depending on its
    value, records what it finds in ``self.structs`` (dicts with 'type',
    'start' and 'end' keys), ``self.fixed_jumps``, ``self.loops``,
    ``self.setup_loops``, ``self.ignore_if``, ``self.not_continue``,
    ``self.else_start``, ``self.return_end_ifs`` and
    ``self.except_targets``.

    :param offset: byte offset of the instruction being examined.
    :param targets: container keyed by offset; ``targets[next_op]`` is
        iterated and each element is used as an index into ``self.code``.
        NOTE(review): presumably the offsets of instructions that jump to
        the key; confirm against the caller.
    :param extended_arg: value forwarded as the second argument to most
        ``self.get_target`` calls (a literal 0 is passed in a few places).
    :return: always ``None``; all results are side effects on ``self``.
    """

    # TODO: check the struct boundaries more precisely -Dan

    code = self.code
    op = code[offset]

    # Detect parent structure
    parent = self.structs[0]
    start = parent['start']
    end = parent['end']

    # Pick inner-most parent for our offset
    for struct in self.structs:
        current_start = struct['start']
        current_end = struct['end']
        if ((current_start <= offset < current_end) and
                (current_start >= start and current_end <= end)):
            start = current_start
            end = current_end
            parent = struct

    if op == self.opc.SETUP_LOOP:
        # We categorize loop types: 'for', 'while', 'while 1' with
        # possibly suffixes '-loop' and '-else'
        # Try to find the jump_back instruction of the loop.
        # It could be a return instruction.

        start += instruction_size(op, self.opc)
        target = self.get_target(offset, extended_arg)
        end = self.restrict_to_parent(target, parent)
        self.setup_loops[target] = offset

        if target != end:
            self.fixed_jumps[offset] = end

        (line_no, next_line_byte) = self.lines[offset]
        jump_back = self.last_instr(start, end, self.opc.JUMP_ABSOLUTE,
                                    next_line_byte, False)

        if jump_back:
            jump_forward_offset = xdis.next_offset(code[jump_back], self.opc, jump_back)
        else:
            jump_forward_offset = None

        return_val_offset1 = self.prev[self.prev[end]]

        # Discard the JUMP_ABSOLUTE candidate when the region before
        # "end" finishes with RETURN_VALUE (optionally followed by
        # POP_BLOCK); the RETURN_VALUE search below is used instead.
        if (jump_back and jump_back != self.prev_op[end] and
                self.is_jump_forward(jump_forward_offset)):
            if (code[self.prev_op[end]] == self.opc.RETURN_VALUE or
                    (code[self.prev_op[end]] == self.opc.POP_BLOCK and
                     code[return_val_offset1] == self.opc.RETURN_VALUE)):
                jump_back = None
        if not jump_back:
            jump_back = self.last_instr(start, end, self.opc.RETURN_VALUE)
            if not jump_back:
                return

            jump_back += 2  # FIXME ???
            if_offset = None
            if code[self.prev_op[next_line_byte]] not in self.pop_jump_tf:
                if_offset = self.prev[next_line_byte]
            if if_offset:
                loop_type = 'while'
                self.ignore_if.add(if_offset)
            else:
                loop_type = 'for'
            target = next_line_byte
            end = xdis.next_offset(code[jump_back], self.opc, jump_back)
        else:
            if self.get_target(jump_back, 0) >= next_line_byte:
                jump_back = self.last_instr(start, end, self.opc.JUMP_ABSOLUTE, start, False)
            if end > jump_back+4 and self.is_jump_forward(end):
                if self.is_jump_forward(jump_back+4):
                    if self.get_target(jump_back+4, extended_arg) == self.get_target(end, extended_arg):
                        self.fixed_jumps[offset] = jump_back+4
                        end = jump_back+4
            elif target < offset:
                self.fixed_jumps[offset] = jump_back+4
                end = jump_back+4

            # I think 0 right because jump_back has been adjusted for any EXTENDED_ARG
            # it encounters
            target = self.get_target(jump_back, 0)

            if code[target] in (self.opc.FOR_ITER, self.opc.GET_ITER):
                loop_type = 'for'
            else:
                loop_type = 'while'
                test = self.prev_op[next_line_byte]

                if test == offset:
                    loop_type = 'while 1'
                elif self.code[test] in self.opc.JUMP_OPs:
                    self.ignore_if.add(test)
                    test_target = self.get_target(test, extended_arg)
                    if test_target > (jump_back+3):
                        jump_back = test_target
            self.not_continue.add(jump_back)
        self.loops.append(target)
        self.structs.append({'type': loop_type + '-loop',
                             'start': target,
                             'end': jump_back})
        after_jump_offset = xdis.next_offset(code[jump_back], self.opc, jump_back)
        # Anything between the end of the loop body and "end" is
        # recorded as the loop's else-part.
        if after_jump_offset != end:
            self.structs.append({'type': loop_type + '-else',
                                 'start': after_jump_offset,
                                 'end': end})
    elif op in self.pop_jump_tf:
        start = offset + instruction_size(op, self.opc)
        target = self.get_target(offset, extended_arg)
        rtarget = self.restrict_to_parent(target, parent)
        prev_op = self.prev_op

        # Do not let jump to go out of parent struct bounds
        if target != rtarget and parent['type'] == 'and/or':
            self.fixed_jumps[offset] = rtarget
            return

        # Does this jump to right after another conditional jump that is
        # not myself?  If so, it's part of a larger conditional.
        # rocky: if we have a conditional jump to the next instruction, then
        # possibly I am "skipping over" a "pass" or null statement.
        if ((code[prev_op[target]] in self.pop_jump_if_pop) and
                (target > offset) and prev_op[target] != offset):
            # FIXME: this is not accurate The commented out below
            # is what it should be. However grammar rules right now
            # assume the incorrect offsets.
            # self.fixed_jumps[offset] = target
            self.fixed_jumps[offset] = prev_op[target]
            self.structs.append({'type': 'and/or',
                                 'start': start,
                                 'end': prev_op[target]})
            return

        # The opcode *two* instructions before the target jump offset is important
        # in making a determination of what we have. Save that.
        pre_rtarget = prev_op[rtarget]

        # Is it an "and" inside an "if" or "while" block
        if op == self.opc.POP_JUMP_IF_FALSE and self.version < 3.6:

            # Search for another POP_JUMP_IF_FALSE targetting the same op,
            # in current statement, starting from current offset, and filter
            # everything inside inner 'or' jumps and midline ifs
            match = self.rem_or(start, self.next_stmt[offset],
                                self.opc.POP_JUMP_IF_FALSE, target)

            # If we still have any offsets in set, start working on it
            if match:
                is_jump_forward = self.is_jump_forward(pre_rtarget)
                if (is_jump_forward and pre_rtarget not in self.stmts and
                        self.restrict_to_parent(self.get_target(pre_rtarget, extended_arg), parent) == rtarget):
                    # The first two arms deliberately do nothing: when
                    # either condition holds, control falls through to
                    # the generic handling further below.
                    if (code[prev_op[pre_rtarget]] == self.opc.JUMP_ABSOLUTE and
                            self.remove_mid_line_ifs([offset]) and
                            target == self.get_target(prev_op[pre_rtarget], extended_arg) and
                            (prev_op[pre_rtarget] not in self.stmts or
                             self.get_target(prev_op[pre_rtarget], extended_arg) > prev_op[pre_rtarget]) and
                            1 == len(self.remove_mid_line_ifs(self.rem_or(start, prev_op[pre_rtarget], self.pop_jump_tf, target)))):
                        pass
                    elif (code[prev_op[pre_rtarget]] == self.opc.RETURN_VALUE and
                          self.remove_mid_line_ifs([offset]) and
                          1 == (len(set(self.remove_mid_line_ifs(self.rem_or(start, prev_op[pre_rtarget], self.pop_jump_tf, target))) |
                                    set(self.remove_mid_line_ifs(self.rem_or(start, prev_op[pre_rtarget], (self.opc.POP_JUMP_IF_FALSE, self.opc.POP_JUMP_IF_TRUE, self.opc.JUMP_ABSOLUTE), pre_rtarget, True)))))):
                        pass
                    else:
                        fix = None
                        jump_ifs = self.all_instr(start, self.next_stmt[offset],
                                                  self.opc.POP_JUMP_IF_FALSE)
                        last_jump_good = True
                        for j in jump_ifs:
                            if target == self.get_target(j, extended_arg):
                                if self.lines[j].next == j + 3 and last_jump_good:
                                    fix = j
                                    break
                            else:
                                last_jump_good = False
                        self.fixed_jumps[offset] = fix or match[-1]
                        return
                else:
                    self.fixed_jumps[offset] = match[-1]
                    return
        # op == POP_JUMP_IF_TRUE
        else:
            next = self.next_stmt[offset]
            if prev_op[next] == offset:
                pass
            elif self.is_jump_forward(next) and target == self.get_target(next, extended_arg):
                if code[prev_op[next]] == self.opc.POP_JUMP_IF_FALSE:
                    if (code[next] == self.opc.JUMP_FORWARD or
                            target != rtarget or
                            code[prev_op[pre_rtarget]] not in
                            (self.opc.JUMP_ABSOLUTE, self.opc.RETURN_VALUE)):
                        self.fixed_jumps[offset] = prev_op[next]
                        return
            elif (code[next] == self.opc.JUMP_ABSOLUTE and self.is_jump_forward(target) and
                  self.get_target(target, extended_arg) == self.get_target(next, extended_arg)):
                self.fixed_jumps[offset] = prev_op[next]
                return

        # Don't add a struct for a while test, it's already taken care of
        if offset in self.ignore_if:
            return

        if (code[pre_rtarget] == self.opc.JUMP_ABSOLUTE and
                pre_rtarget in self.stmts and
                pre_rtarget != offset and
                prev_op[pre_rtarget] != offset and
                not (code[rtarget] == self.opc.JUMP_ABSOLUTE and
                     code[rtarget+3] == self.opc.POP_BLOCK and
                     code[prev_op[pre_rtarget]] != self.opc.JUMP_ABSOLUTE)):
            rtarget = pre_rtarget

        # Does the "jump if" jump beyond a jump op?
        # That is, we have something like:
        #  POP_JUMP_IF_FALSE HERE
        #  ...
        # JUMP_FORWARD
        # HERE:
        #
        # If so, this can be block inside an "if" statement
        # or a conditional assignment like:
        #   x = 1 if x else 2
        #
        # There are other contexts we may need to consider
        # like whether the target is "END_FINALLY"
        # or if the condition jump is to a forward location
        if self.is_jump_forward(pre_rtarget):
            if_end = self.get_target(pre_rtarget, 0)

            # If the jump target is back, we are looping
            if (if_end < pre_rtarget and
                    (code[prev_op[if_end]] == self.opc.SETUP_LOOP)):
                if (if_end > start):
                    return

            end = self.restrict_to_parent(if_end, parent)

            self.structs.append({'type': 'if-then',
                                 'start': start,
                                 'end': pre_rtarget})

            # FIXME: add this
            # self.fixed_jumps[offset] = rtarget
            self.not_continue.add(pre_rtarget)

            if rtarget < end and (
                    code[rtarget] not in (self.opc.END_FINALLY,
                                          self.opc.JUMP_ABSOLUTE) and
                    code[prev_op[pre_rtarget]] not in (self.opc.POP_EXCEPT,
                                                       self.opc.END_FINALLY)):
                self.structs.append({'type': 'else',
                                     'start': rtarget,
                                     'end': end})
                self.else_start[rtarget] = end
        elif self.is_jump_back(pre_rtarget, 0):
            if_end = rtarget
            self.structs.append({'type': 'if-then',
                                 'start': start,
                                 'end': pre_rtarget})
            self.not_continue.add(pre_rtarget)
        elif code[pre_rtarget] in (self.opc.RETURN_VALUE, self.opc.BREAK_LOOP):
            self.structs.append({'type': 'if-then',
                                 'start': start,
                                 'end': rtarget})
            # It is important to distingish if this return is inside some sort
            # except block return
            jump_prev = prev_op[offset]
            if self.is_pypy and code[jump_prev] == self.opc.COMPARE_OP:
                if self.opc.cmp_op[code[jump_prev+1]] == 'exception-match':
                    return
            if self.version >= 3.5:
                # Python 3.5 may remove as dead code a JUMP
                # instruction after a RETURN_VALUE. So we check
                # based on seeing SETUP_EXCEPT various places.
                if code[rtarget] == self.opc.SETUP_EXCEPT:
                    return
                # Check that next instruction after pops and jump is
                # not from SETUP_EXCEPT
                next_op = rtarget
                if code[next_op] == self.opc.POP_BLOCK:
                    next_op += instruction_size(self.code[next_op], self.opc)
                if code[next_op] == self.opc.JUMP_ABSOLUTE:
                    next_op += instruction_size(self.code[next_op], self.opc)
                if next_op in targets:
                    for try_op in targets[next_op]:
                        come_from_op = code[try_op]
                        if come_from_op == self.opc.SETUP_EXCEPT:
                            return
                        pass
                pass
            if code[pre_rtarget] == self.opc.RETURN_VALUE:
                self.return_end_ifs.add(pre_rtarget)
            else:
                self.fixed_jumps[offset] = rtarget
                self.not_continue.add(pre_rtarget)
        else:
            # For now, we'll only tag forward jump.
            if self.version >= 3.6:
                if target > offset:
                    self.fixed_jumps[offset] = target
                    pass
            else:
                # FIXME: This is probably a bug in < 3.6 and we should
                # instead use the above code. But until we smoke things
                # out we'll stick with it.
                if rtarget > offset:
                    self.fixed_jumps[offset] = rtarget

    elif op == self.opc.SETUP_EXCEPT:
        target = self.get_target(offset, extended_arg)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op == self.opc.POP_EXCEPT:
        # Look at the instruction that follows POP_EXCEPT.
        next_offset = xdis.next_offset(op, self.opc, offset)
        target = self.get_target(next_offset, extended_arg)
        if target > next_offset:
            next_op = code[next_offset]
            if (self.opc.JUMP_ABSOLUTE == next_op and
                    self.opc.END_FINALLY != code[xdis.next_offset(next_op, self.opc, next_offset)]):
                self.fixed_jumps[next_offset] = target
                self.except_targets[target] = next_offset

    elif op == self.opc.SETUP_FINALLY:
        target = self.get_target(offset, extended_arg)
        end = self.restrict_to_parent(target, parent)
        self.fixed_jumps[offset] = end
    elif op in self.jump_if_pop:
        target = self.get_target(offset, extended_arg)
        if target > offset:
            unop_target = self.last_instr(offset, target, self.opc.JUMP_FORWARD, target)
            if unop_target and code[unop_target+3] != self.opc.ROT_TWO:
                self.fixed_jumps[offset] = unop_target
            else:
                self.fixed_jumps[offset] = self.restrict_to_parent(target, parent)
                pass
            pass
    elif self.version >= 3.5:
        # 3.5+ has Jump optimization which too often causes RETURN_VALUE to get
        # misclassified as RETURN_END_IF. Handle that here.
        # In RETURN_VALUE, JUMP_ABSOLUTE, RETURN_VALUE is never RETURN_END_IF
        if op == self.opc.RETURN_VALUE:
            next_offset = xdis.next_offset(op, self.opc, offset)
            if (next_offset < len(code) and
                    code[next_offset] == self.opc.JUMP_ABSOLUTE and
                    offset in self.return_end_ifs):
                self.return_end_ifs.remove(offset)
                pass
            pass
        elif op == self.opc.JUMP_FORWARD:
            # If we have:
            #   JUMP_FORWARD x, [non-jump, insns], RETURN_VALUE, x:
            # then RETURN_VALUE is not RETURN_END_IF
            rtarget = self.get_target(offset, extended_arg)
            rtarget_prev = self.prev[rtarget]
            if (code[rtarget_prev] == self.opc.RETURN_VALUE and
                    rtarget_prev in self.return_end_ifs):
                # Walk backwards from the RETURN_VALUE to this jump;
                # give up if another jump lies in between.
                i = rtarget_prev
                while i != offset:
                    if code[i] in [op3.JUMP_FORWARD, op3.JUMP_ABSOLUTE]:
                        return
                    i = self.prev[i]
                self.return_end_ifs.remove(rtarget_prev)
            pass
    return
def basic_blocks(
    fn_or_code,
    version=PYTHON_VERSION_TRIPLE,
    is_pypy=IS_PYPY,
    more_precise_returns=False,
    print_instructions=False,
):
    """Create a list of basic blocks found in a code object.

    The instructions of `fn_or_code` are scanned twice. The first pass
    collects the offsets of all jump targets. The second pass walks the
    instructions in order, accumulating flags and jump offsets for the
    block under construction, and closes that block with `BB.add_bb()`
    when the current instruction is a jump target or is an instruction
    that ends a block (break, "for", jump, or no-follow instruction).

    Parameters:
      fn_or_code: passed unchanged to `get_instructions()`.
      version: version tuple of the bytecode. It is given to `BBMgr` and
        `get_jump_val()`, selects the offset of the artificial exit block
        (last offset + 2 for >= (3, 6), otherwise + 1), and decides
        whether a jump that is not unconditional closes a block.
      is_pypy: passed unchanged to `BBMgr`.
      more_precise_returns: indicates whether RETURN_VALUE should be
        modeled as a jump to the end of the enclosing function or not.
        When true, every block ended by a return instruction gets the
        exit-block offset added to its `jump_offsets` and its
        `edge_count` incremented. See the comment in the code as to why
        this might be useful.
      print_instructions: when true, each instruction is printed as it
        is visited in the second pass.

    Returns:
      The `BBMgr` instance that was filled in. Its `start_block` is set
      to the first entry of its `bb_list`.
    """
    BB = BBMgr(version, is_pypy)

    # Get jump targets
    # (first pass: every offset collected here starts a new basic block
    # in the second pass below).
    jump_targets = set()
    instructions = list(get_instructions(fn_or_code))
    for inst in instructions:
        op = inst.opcode
        offset = inst.offset
        follow_offset = next_offset(op, BB.opcode, offset)
        if op in BB.JUMP_INSTRUCTIONS:
            jump_value = get_jump_val(inst.arg, version)
            if op in BB.JABS_INSTRUCTIONS:
                # Absolute jump: the value is the target offset itself.
                jump_offset = jump_value
            else:
                # Otherwise the value is taken relative to the offset of
                # the following instruction.
                jump_offset = follow_offset + jump_value
            jump_targets.add(jump_offset)
            pass

    # Add an artificial block where we can link the exits of other blocks
    # to. This helps when there is a "raise" not in any try block and
    # in computing reverse dominators.
    end_offset = instructions[-1].offset
    if version >= (3, 6):
        end_bb_offset = end_offset + 2
    else:
        end_bb_offset = end_offset + 1

    end_block, _, _ = BB.add_bb(
        end_bb_offset, end_bb_offset, None, None, set([BB_EXIT]), []
    )

    # State of the basic block currently being built.
    start_offset = 0
    end_offset = -1
    jump_offsets = set()
    prev_offset = -1
    # The -1 entry acts as a sentinel so that endloop_offsets[-1] can be
    # read even when no loop has been entered.
    endloop_offsets = [-1]
    flags = set([BB_ENTRY])
    end_try_offset_stack = []
    # Seeded with the exit block, so exception offsets recorded before any
    # "try" block has been pushed are attached to the exit block.
    # Blocks are only ever appended to this list in this function.
    try_stack = [end_block]
    end_try_offset = None
    loop_offset = None
    # Blocks closed by a return instruction; used after the loop when
    # more_precise_returns is set.
    return_blocks = []

    for i, inst in enumerate(instructions):
        if print_instructions:
            print(inst)
        # prev_offset is the offset of the previous instruction; a block
        # that is closed because the current instruction is a jump target
        # ends there.
        prev_offset = end_offset
        end_offset = inst.offset
        op = inst.opcode
        offset = inst.offset
        follow_offset = next_offset(op, BB.opcode, offset)

        # NOTE(review): end_try_offset starts as None and is only
        # reassigned inside this branch, so within this function the
        # branch looks unreachable -- confirm the intent.
        if offset == end_try_offset:
            if len(end_try_offset_stack):
                end_try_offset = end_try_offset_stack[-1]
                end_try_offset_stack.pop()
            else:
                end_try_offset = None

        if op in BB.LOOP_INSTRUCTIONS:
            # Remember where this loop ends (operand added to the offset
            # of the following instruction) so that a break instruction
            # inside it can be given that offset as its jump target.
            jump_offset = follow_offset + inst.arg
            endloop_offsets.append(jump_offset)
            loop_offset = offset
        elif offset == endloop_offsets[-1]:
            # We have reached the end of the innermost loop being tracked.
            endloop_offsets.pop()
        pass

        if op in BB.LOOP_INSTRUCTIONS:
            flags.add(BB_LOOP)
        elif op in BB.BREAK_INSTRUCTIONS:
            # A break jumps to the end of the innermost tracked loop and
            # ends the current basic block.
            flags.add(BB_BREAK)
            jump_offsets.add(endloop_offsets[-1])
            block, flags, jump_offsets = BB.add_bb(
                start_offset,
                end_offset,
                loop_offset,
                follow_offset,
                flags,
                jump_offsets,
            )
            loop_offset = None
            if BB_TRY in block.flags:
                try_stack.append(block)
            start_offset = follow_offset

        if offset in jump_targets:
            # Fallthrough path and jump target path.
            # This instruction definitely starts a new basic block
            # Close off any prior basic block
            if start_offset < end_offset:
                # The prior block ends at the previous instruction and
                # falls through to this one.
                block, flags, jump_offsets = BB.add_bb(
                    start_offset,
                    prev_offset,
                    loop_offset,
                    end_offset,
                    flags,
                    jump_offsets,
                )
                loop_offset = None
                if BB_TRY in block.flags:
                    try_stack.append(block)
                    pass

                start_offset = end_offset
                pass

        # Add block flags for certain classes of instructions
        if op in BB.JUMP_CONDITONAL:
            flags.add(BB_JUMP_CONDITIONAL)

        if op in BB.POP_BLOCK_INSTRUCTIONS:
            flags.add(BB_POP_BLOCK)
            if start_offset == offset:
                # The pop-block instruction is the first instruction of
                # the block: record the more specific flag instead.
                flags.add(BB_STARTS_POP_BLOCK)
                flags.remove(BB_POP_BLOCK)
        elif op in BB.EXCEPT_INSTRUCTIONS:
            # NOTE(review): this tests the version of the running
            # interpreter, not the `version` argument -- confirm that
            # this is intended.
            if sys.version_info[0:2] <= (2, 7):
                # In Python up to 2.7, three 'POP_TOP's at the beginning of
                # a block indicate an exception handler. We also check
                # that we are nested inside a "try".
                if len(try_stack) == 0 or start_offset != offset:
                    continue
                pass
                if (
                    instructions[i + 1].opcode != BB.opcode.opmap["POP_TOP"]
                    or instructions[i + 2].opcode != BB.opcode.opmap["POP_TOP"]
                ):
                    continue
            flags.add(BB_EXCEPT)
            try_stack[-1].exception_offsets.add(start_offset)
            pass
        elif op in BB.TRY_INSTRUCTIONS:
            # Record the operand value of the "try" instruction and mark
            # the block; the block itself is not closed here.
            end_try_offset_stack.append(inst.argval)
            flags.add(BB_TRY)
        elif op in BB.END_FINALLY_INSTRUCTIONS:
            flags.add(BB_END_FINALLY)
            try_stack[-1].exception_offsets.add(start_offset)
        elif op in BB.FOR_INSTRUCTIONS:
            # A "for" instruction ends the block; its operand value is
            # added as a jump offset of the block.
            flags.add(BB_FOR)
            jump_offsets.add(inst.argval)
            block, flags, jump_offsets = BB.add_bb(
                start_offset,
                end_offset,
                loop_offset,
                follow_offset,
                flags,
                jump_offsets,
            )
            loop_offset = None
            start_offset = follow_offset
        elif op in BB.JUMP_INSTRUCTIONS:
            # Some sort of jump instruction.
            # Figure out where we jump to and add it to this
            # basic block's jump offsets.
            jump_offset = inst.argval

            jump_offsets.add(jump_offset)
            if op in BB.JUMP_UNCONDITONAL:
                flags.add(BB_JUMP_UNCONDITIONAL)
                if jump_offset == follow_offset:
                    # The jump lands on the very next instruction.
                    flags.add(BB_JUMP_TO_FALLTHROUGH)
                    pass
                block, flags, jump_offsets = BB.add_bb(
                    start_offset,
                    end_offset,
                    loop_offset,
                    follow_offset,
                    flags,
                    jump_offsets,
                )
                loop_offset = None
                if BB_TRY in block.flags:
                    try_stack.append(block)
                    pass

                start_offset = follow_offset
            elif version[:2] >= (3, 9) or (
                version[:2] < (3, 8) and op != BB.opcode.SETUP_LOOP
            ):
                # A jump that is not unconditional also ends the block,
                # except for version 3.8 and for SETUP_LOOP on versions
                # before 3.8.
                if op in BB.FINALLY_INSTRUCTIONS:
                    flags.add(BB_FINALLY)

                block, flags, jump_offsets = BB.add_bb(
                    start_offset,
                    end_offset,
                    loop_offset,
                    follow_offset,
                    flags,
                    jump_offsets,
                )
                loop_offset = None
                if BB_TRY in block.flags:
                    try_stack.append(block)
                start_offset = follow_offset

                pass
        elif op in BB.NOFOLLOW_INSTRUCTIONS:
            # Instructions in this set end the block; return instructions
            # are additionally flagged and their blocks remembered.
            flags.add(BB_NOFOLLOW)
            if op in BB.RETURN_INSTRUCTIONS:
                flags.add(BB_RETURN)
            last_block, flags, jump_offsets = BB.add_bb(
                start_offset,
                end_offset,
                loop_offset,
                follow_offset,
                flags,
                jump_offsets,
            )
            loop_offset = None
            start_offset = follow_offset
            if op in BB.RETURN_INSTRUCTIONS:
                return_blocks.append(last_block)
            pass
        pass

    # If the bytecode comes from Python, then there is possibly an
    # advantage in treating a return in a block as an instruction
    # which flows to the next instruction, since that will treat
    # blocks with unreachable instructions the way Python source
    # does - the code after that exists.
    #
    # However, if you care about analysis, then
    # hook RETURN_VALUE instructions to the exit block offset.
    if more_precise_returns:
        for block in return_blocks:
            block.jump_offsets.add(end_bb_offset)
            block.edge_count += 1

    if len(BB.bb_list):
        # The last block in the list has nothing to fall through to.
        # Note that this runs before the trailing block (if any) is
        # appended below.
        BB.bb_list[-1].follow_offset = None
        BB.start_block = BB.bb_list[0]

    # Add remaining instructions?
    # Instructions seen since the last closed block are put into a final
    # block, appended directly to bb_list rather than through add_bb().
    if start_offset <= end_offset:
        BB.bb_list.append(
            BasicBlock(
                start_offset,
                end_offset,
                loop_offset,
                None,
                flags=flags,
                jump_offsets=jump_offsets,
            )
        )
        loop_offset = None
        pass

    return BB