Esempio n. 1
0
def number_loop(queue, mappings, opc):
    while len(queue) > 0:
        code1 = queue.popleft()
        code2 = queue.popleft()
        assert code1.co_name == code2.co_name
        linestarts_orig = findlinestarts(code1)
        linestarts_uncompiled = list(findlinestarts(code2))
        mappings += [[line, offset2line(offset, linestarts_uncompiled)]
                     for offset, line in linestarts_orig]
        bytecode1 = Bytecode(code1, opc)
        bytecode2 = Bytecode(code2, opc)
        instr2s = bytecode2.get_instructions(code2)
        seen = set([code1.co_name])
        for instr in bytecode1.get_instructions(code1):
            next_code1 = None
            if iscode(instr.argval):
                next_code1 = instr.argval
            if next_code1:
                next_code2 = None
                while not next_code2:
                    try:
                        instr2 = next(instr2s)
                        if iscode(instr2.argval):
                            next_code2 = instr2.argval
                            pass
                    except StopIteration:
                        break
                    pass
                if next_code2:
                    assert next_code1.co_name == next_code2.co_name
                    if next_code1.co_name not in seen:
                        seen.add(next_code1.co_name)
                        queue.append(next_code1)
                        queue.append(next_code2)
                        pass
                    pass
            pass
        pass
Esempio n. 2
0
def number_loop(queue, mappings, opc):
    while len(queue) > 0:
        code1 = queue.popleft()
        code2 = queue.popleft()
        assert code1.co_name == code2.co_name
        linestarts_orig = findlinestarts(code1)
        linestarts_uncompiled = list(findlinestarts(code2))
        mappings += [[line, offset2line(offset, linestarts_uncompiled)] for offset, line in linestarts_orig]
        bytecode1 = Bytecode(code1, opc)
        bytecode2 = Bytecode(code2, opc)
        instr2s = bytecode2.get_instructions(code2)
        seen = set([code1.co_name])
        for instr in bytecode1.get_instructions(code1):
            next_code1 = None
            if iscode(instr.argval):
                next_code1 = instr.argval
            if next_code1:
                next_code2 = None
                while not next_code2:
                    try:
                        instr2 = next(instr2s)
                        if iscode(instr2.argval):
                            next_code2 = instr2.argval
                            pass
                    except StopIteration:
                        break
                    pass
                if next_code2:
                    assert next_code1.co_name == next_code2.co_name
                    if next_code1.co_name not in seen:
                        seen.add(next_code1.co_name)
                        queue.append(next_code1)
                        queue.append(next_code2)
                        pass
                    pass
            pass
        pass
Esempio n. 3
0
def test_inst_size():
    if (PYTHON_VERSION_TRIPLE[:2] == (3, 6)) and not IS_PYPY:
        opc = get_opcode_module(sys.version_info)
        bytecode_obj = Bytecode(extended_arg_fn36, opc)
        instructions = list(bytecode_obj.get_instructions(extended_arg_fn36))

        inst1 = instructions[1]
        assert inst1.opname == 'EXTENDED_ARG'
        assert inst1.argval == 0

        inst2 = instructions[2]
        assert inst2.opname == 'POP_JUMP_IF_FALSE'
        assert inst2.has_extended_arg == True
        assert inst2.inst_size == 4

        # for inst in instructions:
        #     print(inst)
    else:
        assert True
Esempio n. 4
0
def test_inst_jumps():
    if (sys.version_info >= (2, 7)):
        variant = 'pypy' if IS_PYPY else None
        opc = get_opcode_module(sys.version_info, variant)
        bytecode_obj = Bytecode(extended_arg_fn36, opc)
        instructions = list(bytecode_obj.get_instructions(loop))
        seen_pjif = False
        seen_ja = False
        for inst in instructions:
            if inst.opname == "POP_JUMP_IF_FALSE":
                assert inst.is_jump()
                seen_pjif = True
            elif inst.opname == "JUMP_ABSOLUTE":
                assert inst.is_jump()
                assert not inst.jumps_forward()
                seen_ja = True
                pass
            pass
        assert seen_pjif
        assert seen_ja
Esempio n. 5
0
def test_inst_size():
    if (sys.version_info == (3, 6)):
        variant = 'pypy' if IS_PYPY else None
        opc = get_opcode_module(sys.version_info, variant)
        bytecode_obj = Bytecode(extended_arg_fn36, opc)
        instructions = list(bytecode_obj.get_instructions(extended_arg_fn36))

        inst1 = instructions[1]
        assert inst1.opname == 'EXTENDED_ARG'
        assert inst1.argval == 0

        inst2 = instructions[2]
        assert inst2.opname == 'POP_JUMP_IF_FALSE'
        assert inst2.has_extended_arg == True
        assert inst2.inst_size == 4

        # for inst in instructions:
        #     print(inst)
    else:
        assert True
Esempio n. 6
0
def test_inst_size():
    if (sys.version_info == (3,6)):
        variant = 'pypy' if IS_PYPY else None
        opc = get_opcode_module(sys.version_info, variant)
        bytecode_obj = Bytecode(extended_arg_fn36, opc)
        instructions = list(bytecode_obj.get_instructions(extended_arg_fn36))

        inst1 = instructions[1]
        assert inst1.opname == 'EXTENDED_ARG'
        assert inst1.argval == 0

        inst2 = instructions[2]
        assert inst2.opname == 'POP_JUMP_IF_FALSE'
        assert inst2.has_extended_arg == True
        assert inst2.inst_size == 4

        # for inst in instructions:
        #     print(inst)
    else:
        assert True
Esempio n. 7
0
def test_inst_jumps():
    if (sys.version_info >= (2, 7)):
        variant = 'pypy' if IS_PYPY else None
        opc = get_opcode_module(sys.version_info, variant)
        bytecode_obj = Bytecode(extended_arg_fn36, opc)
        instructions = list(bytecode_obj.get_instructions(loop))
        seen_pjif = False
        seen_ja = False
        for inst in instructions:
            if inst.opname == "POP_JUMP_IF_FALSE":
                assert inst.is_jump()
                seen_pjif = True
            elif inst.opname == "JUMP_ABSOLUTE":
                assert inst.is_jump()
                assert not inst.jumps_forward()
                seen_ja = True
                pass
            pass
        assert seen_pjif
        # Python 3.10 code generation is more efficient and doesn't
        # and removes a JUMP_ABSOLUTE.
        if PYTHON_VERSION_TRIPLE < (3, 10):
            assert seen_ja
Esempio n. 8
0
    def disassemble(self, co, classname=None, code_objects={}, show_asm=None):
        '''
        Disassemble a code object, returning a list of 'Token'.

        The main part of this procedure is modelled after
        dis.disassemble().
        '''

        show_asm = self.show_asm if not show_asm else show_asm
        if show_asm in ('both', 'before'):
            from xdis.bytecode import Bytecode
            bytecode = Bytecode(co, self.opc)
            for instr in bytecode.get_instructions(co):
                print(instr._disassemble())

        # from xdis.bytecode import Bytecode
        # bytecode = Bytecode(co, self.opc)
        # for instr in bytecode.get_instructions(co):
        #     print(instr._disassemble())

        # Container for tokens
        tokens = []

        customize = {}
        Token = self.Token # shortcut

        n = self.setup_code(co)

        self.build_lines_data(co, n)
        self.build_prev_op(n)

        # linestarts contains block code adresses (addr,block)
        self.linestarts = list(findlinestarts(co))

        # class and names
        if classname:
            classname = '_' + classname.lstrip('_') + '__'

            def unmangle(name):
                if name.startswith(classname) and name[-2:] != '__':
                    return name[len(classname) - 2:]
                return name

            free = [ unmangle(name) for name in (co.co_cellvars + co.co_freevars) ]
            names = [ unmangle(name) for name in co.co_names ]
            varnames = [ unmangle(name) for name in co.co_varnames ]
        else:
            free = co.co_cellvars + co.co_freevars
            names = co.co_names
            varnames = co.co_varnames
        self.names = names

        codelen = len(self.code)

        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        for i in self.op_range(0, n):
            # We need to detect the difference between
            # "raise AssertionError" and
            # "assert"
            if (self.code[i] == self.opc.JUMP_IF_TRUE and
                i + 4 < codelen and
                self.code[i+3] == self.opc.POP_TOP and
                self.code[i+4] == self.opc.LOAD_GLOBAL):
                if names[self.get_argument(i+4)] == 'AssertionError':
                    self.load_asserts.add(i+4)

        cf = self.find_jump_targets()
        # contains (code, [addrRefToCode])

        last_stmt = self.next_stmt[0]
        i = self.next_stmt[last_stmt]
        replace = {}
        while i < codelen - 1:
            if self.lines[last_stmt].next > i:
                if self.code[last_stmt] == self.opc.PRINT_ITEM:
                    if self.code[i] == self.opc.PRINT_ITEM:
                        replace[i] = 'PRINT_ITEM_CONT'
                    elif self.code[i] == self.opc.PRINT_NEWLINE:
                        replace[i] = 'PRINT_NEWLINE_CONT'
            last_stmt = i
            i = self.next_stmt[i]

        imports = self.all_instr(0, codelen,
                                (self.opc.IMPORT_NAME, self.opc.IMPORT_FROM,
                                 self.opc.IMPORT_STAR))
        # Changes IMPORT_NAME to IMPORT_NAME_CONT.
        # Possibly a Python 2.0 hangover
        if len(imports) > 1 and self.version < 2.3:
            last_import = imports[0]
            for i in imports[1:]:
                if self.lines[last_import].next > i:
                    if self.code[last_import] == self.opc.IMPORT_NAME == self.code[i]:
                        replace[i] = 'IMPORT_NAME_CONT'
                last_import = i

        extended_arg = 0
        for offset in self.op_range(0, codelen):
            op = self.code[offset]
            op_name = self.opname[op]
            oparg = None; pattr = None

            if offset in cf:
                k = 0
                for j in cf[offset]:
                    tokens.append(Token('COME_FROM', None, repr(j),
                                        offset="%s_%d" % (offset, k),
                                        has_arg = True))
                    k += 1

            has_arg = (op >= self.opc.HAVE_ARGUMENT)
            if has_arg:
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
                    raise NotImplementedError
                    extended_arg = oparg * scan.L65536
                    continue
                if op in self.opc.hasconst:
                    const = co.co_consts[oparg]
                    # We can't use inspect.iscode() because we may be
                    # using a different version of Python than the
                    # one that this was byte-compiled on. So the code
                    # types may mismatch.
                    if hasattr(const, 'co_name'):
                        oparg = const
                        if const.co_name == '<lambda>':
                            assert op_name == 'LOAD_CONST'
                            op_name = 'LOAD_LAMBDA'
                        elif const.co_name == self.genexpr_name:
                            op_name = 'LOAD_GENEXPR'
                        elif const.co_name == '<dictcomp>':
                            op_name = 'LOAD_DICTCOMP'
                        elif const.co_name == '<setcomp>':
                            op_name = 'LOAD_SETCOMP'
                        # verify uses 'pattr' for comparison, since 'attr'
                        # now holds Code(const) and thus can not be used
                        # for comparison (todo: think about changing this)
                        # pattr = 'code_object @ 0x%x %s->%s' % \
                        # (id(const), const.co_filename, const.co_name)
                        pattr = '<code_object ' + const.co_name + '>'
                    else:
                        pattr = const
                elif op in self.opc.hasname:
                    pattr = names[oparg]
                elif op in self.opc.hasjrel:
                    pattr = repr(offset + 3 + oparg)
                    if op == self.opc.JUMP_FORWARD:
                        target = self.get_target(offset)
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        if len(tokens) and tokens[-1].type == 'JUMP_BACK':
                            tokens[-1].type = intern('CONTINUE')

                elif op in self.opc.hasjabs:
                    pattr = repr(oparg)
                elif op in self.opc.haslocal:
                    pattr = varnames[oparg]
                elif op in self.opc.hascompare:
                    pattr = self.opc.cmp_op[oparg]
                elif op in self.opc.hasfree:
                    pattr = free[oparg]
            if op in self.varargs_ops:
                # CE - Hack for >= 2.5
                #      Now all values loaded via LOAD_CLOSURE are packed into
                #      a tuple before calling MAKE_CLOSURE.
                if (op == self.opc.BUILD_TUPLE and
                    self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE):
                    continue
                else:
                    op_name = '%s_%d' % (op_name, oparg)
                    if op != self.opc.BUILD_SLICE:
                        customize[op_name] = oparg
            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classifhy JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead.
                target = self.get_target(offset)
                if target <= offset:
                    if (offset in self.stmts
                        and self.code[offset+3] not in (self.opc.END_FINALLY,
                                                          self.opc.POP_BLOCK)
                        and offset not in self.not_continue):
                        op_name = 'CONTINUE'
                    else:
                        op_name = 'JUMP_BACK'
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        if tokens[-1].type == 'JUMP_BACK':
                            tokens[-1].type = intern('CONTINUE')

            elif op == self.opc.LOAD_GLOBAL:
                if offset in self.load_asserts:
                    op_name = 'LOAD_ASSERT'
            elif op == self.opc.RETURN_VALUE:
                if offset in self.return_end_ifs:
                    op_name = 'RETURN_END_IF'

            if offset in self.linestartoffsets:
                linestart = self.linestartoffsets[offset]
            else:
                linestart = None

            if offset not in replace:
                tokens.append(Token(
                    op_name, oparg, pattr, offset, linestart, op, has_arg))
            else:
                tokens.append(Token(
                    replace[offset], oparg, pattr, offset, linestart, op, has_arg))
                pass
            pass

        if show_asm:
            for t in tokens:
                print(t)
            print()
        return tokens, customize
Esempio n. 9
0
    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
        returning a list of uncompyle6 'Token's.

        The transformations are made to assist the deparsing grammar.
        Specificially:
           -  various types of LOAD_CONST's are categorized in terms of what they load
           -  COME_FROM instructions are added to assist parsing control structures
           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

        Also, when we encounter certain tokens, we add them to a set which will cause custom
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """

        show_asm = self.show_asm if not show_asm else show_asm
        # show_asm = 'after'
        if show_asm in ('both', 'before'):
            from xdis.bytecode import Bytecode
            bytecode = Bytecode(co, self.opc)
            for instr in bytecode.get_instructions(co):
                print(instr._disassemble())

        # Container for tokens
        tokens = []

        customize = {}
        if self.is_pypy:
            customize['PyPy'] = 1

        Token = self.Token  # shortcut

        n = self.setup_code(co)

        self.build_lines_data(co, n)
        self.build_prev_op(n)

        free, names, varnames = self.unmangle_code_names(co, classname)
        self.names = names

        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        for i in self.op_range(0, n):
            # We need to detect the difference between:
            #   raise AssertionError
            #  and
            #   assert ...
            # Below we use the heuristic that it is preceded by a POP_JUMP.
            # however we could also use followed by RAISE_VARARGS
            # or for PyPy there may be a JUMP_IF_NOT_DEBUG before.
            # FIXME: remove uses of PJIF, and PJIT
            if self.is_pypy:
                have_pop_jump = self.code[i] in (self.opc.PJIF, self.opc.PJIT)
            else:
                have_pop_jump = self.code[i] == self.opc.PJIT

            if have_pop_jump and self.code[i + 3] == self.opc.LOAD_GLOBAL:
                if names[self.get_argument(i + 3)] == 'AssertionError':
                    self.load_asserts.add(i + 3)

        jump_targets = self.find_jump_targets(show_asm)
        # contains (code, [addrRefToCode])

        last_stmt = self.next_stmt[0]
        i = self.next_stmt[last_stmt]
        replace = {}
        while i < n - 1:
            if self.lines[last_stmt].next > i:
                # Distinguish "print ..." from "print ...,"
                if self.code[last_stmt] == self.opc.PRINT_ITEM:
                    if self.code[i] == self.opc.PRINT_ITEM:
                        replace[i] = 'PRINT_ITEM_CONT'
                    elif self.code[i] == self.opc.PRINT_NEWLINE:
                        replace[i] = 'PRINT_NEWLINE_CONT'
            last_stmt = i
            i = self.next_stmt[i]

        extended_arg = 0
        for offset in self.op_range(0, n):
            if offset in jump_targets:
                jump_idx = 0
                # We want to process COME_FROMs to the same offset to be in *descending*
                # offset order so we have the larger range or biggest instruction interval
                # last. (I think they are sorted in increasing order, but for safety
                # we sort them). That way, specific COME_FROM tags will match up
                # properly. For example, a "loop" with an "if" nested in it should have the
                # "loop" tag last so the grammar rule matches that properly.
                # last_offset = -1
                for jump_offset in sorted(jump_targets[offset], reverse=True):
                    # if jump_offset == last_offset:
                    #     continue
                    # last_offset = jump_offset
                    come_from_name = 'COME_FROM'
                    op_name = self.opc.opname[self.code[jump_offset]]
                    if op_name.startswith('SETUP_') and self.version == 2.7:
                        come_from_type = op_name[len('SETUP_'):]
                        if come_from_type not in ('LOOP', 'EXCEPT'):
                            come_from_name = 'COME_FROM_%s' % come_from_type
                        pass
                    tokens.append(
                        Token(come_from_name,
                              None,
                              repr(jump_offset),
                              offset="%s_%d" % (offset, jump_idx),
                              has_arg=True))
                    jump_idx += 1

            op = self.code[offset]
            op_name = self.opc.opname[op]

            oparg = None
            pattr = None
            has_arg = op_has_argument(op, self.opc)
            if has_arg:
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
                    extended_arg = oparg * scan.L65536
                    continue
                if op in self.opc.hasconst:
                    const = co.co_consts[oparg]
                    if iscode(const):
                        oparg = const
                        if const.co_name == '<lambda>':
                            assert op_name == 'LOAD_CONST'
                            op_name = 'LOAD_LAMBDA'
                        elif const.co_name == '<genexpr>':
                            op_name = 'LOAD_GENEXPR'
                        elif const.co_name == '<dictcomp>':
                            op_name = 'LOAD_DICTCOMP'
                        elif const.co_name == '<setcomp>':
                            op_name = 'LOAD_SETCOMP'
                        # verify() uses 'pattr' for comparison, since 'attr'
                        # now holds Code(const) and thus can not be used
                        # for comparison (todo: think about changing this)
                        # pattr = 'code_object @ 0x%x %s->%s' %\
                        # (id(const), const.co_filename, const.co_name)
                        pattr = '<code_object ' + const.co_name + '>'
                    else:
                        pattr = const
                elif op in self.opc.hasname:
                    pattr = names[oparg]
                elif op in self.opc.hasjrel:
                    #  use instead: hasattr(self, 'patch_continue'): ?
                    if self.version == 2.7:
                        self.patch_continue(tokens, offset, op)
                    pattr = repr(offset + 3 + oparg)
                elif op in self.opc.hasjabs:
                    # use instead: hasattr(self, 'patch_continue'): ?
                    if self.version == 2.7:
                        self.patch_continue(tokens, offset, op)
                    pattr = repr(oparg)
                elif op in self.opc.haslocal:
                    pattr = varnames[oparg]
                elif op in self.opc.hascompare:
                    pattr = self.opc.cmp_op[oparg]
                elif op in self.opc.hasfree:
                    pattr = free[oparg]

            if op in self.varargs_ops:
                # CE - Hack for >= 2.5
                #      Now all values loaded via LOAD_CLOSURE are packed into
                #      a tuple before calling MAKE_CLOSURE.
                if op == self.opc.BUILD_TUPLE and \
                    self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE:
                    continue
                else:
                    if self.is_pypy and not oparg and op_name == 'BUILD_MAP':
                        op_name = 'BUILD_MAP_n'
                    else:
                        op_name = '%s_%d' % (op_name, oparg)
                    if op != self.opc.BUILD_SLICE:
                        customize[op_name] = oparg
            elif self.is_pypy and op_name in ('LOOKUP_METHOD',
                                              'JUMP_IF_NOT_DEBUG',
                                              'SETUP_EXCEPT', 'SETUP_FINALLY'):
                # The value in the dict is in special cases in semantic actions, such
                # as CALL_FUNCTION. The value is not used in these cases, so we put
                # in arbitrary value 0.
                customize[op_name] = 0
            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classify JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead. In
                # comprehensions we might sometimes classify JUMP_BACK
                # as CONTINUE, but that's okay since we add a grammar
                # rule for that.
                target = self.get_target(offset)
                if target <= offset:
                    if (offset in self.stmts and self.code[offset + 3]
                            not in (self.opc.END_FINALLY, self.opc.POP_BLOCK)
                            and offset not in self.not_continue):
                        op_name = 'CONTINUE'
                    else:
                        op_name = 'JUMP_BACK'

            elif op == self.opc.LOAD_GLOBAL:
                if offset in self.load_asserts:
                    op_name = 'LOAD_ASSERT'
            elif op == self.opc.RETURN_VALUE:
                if offset in self.return_end_ifs:
                    op_name = 'RETURN_END_IF'

            if offset in self.linestartoffsets:
                linestart = self.linestartoffsets[offset]
            else:
                linestart = None

            if offset not in replace:
                tokens.append(
                    Token(op_name, oparg, pattr, offset, linestart, op,
                          has_arg, self.opc))
            else:
                tokens.append(
                    Token(replace[offset], oparg, pattr, offset, linestart, op,
                          has_arg, self.opc))
                pass
            pass

        if show_asm in ('both', 'after'):
            for t in tokens:
                print(t.format(line_prefix='L.'))
            print()
        return tokens, customize
Esempio n. 10
0
    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
        returning a list of uncompyle6 'Token's.

        The transformations are made to assist the deparsing grammar.
        Specificially:
           -  various types of LOAD_CONST's are categorized in terms of what they load
           -  COME_FROM instructions are added to assist parsing control structures
           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

        Also, when we encounter certain tokens, we add them to a set which will cause custom
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """

        show_asm = self.show_asm if not show_asm else show_asm
        # show_asm = 'after'
        if show_asm in ('both', 'before'):
            bytecode = Bytecode(co, self.opc)
            for instr in bytecode.get_instructions(co):
                print(instr._disassemble())

        # Container for tokens
        tokens = []

        customize = {}
        if self.is_pypy:
            customize['PyPy'] = 1

        self.code = array('B', co.co_code)
        self.build_lines_data(co)
        self.build_prev_op()

        bytecode = Bytecode(co, self.opc)

        # FIXME: put as its own method?
        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        bs = list(bytecode)
        n = len(bs)
        for i in range(n):
            inst = bs[i]

            # We need to detect the difference between
            # "raise AssertionError" and "assert"
            # If we have a JUMP_FORWARD after the
            # RAISE_VARARGS then we have a "raise" statement
            # else we have an "assert" statement.
            if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
                next_inst = bs[i+1]
                if (next_inst.opname == 'LOAD_GLOBAL' and
                    next_inst.argval == 'AssertionError'):
                    for j in range(i+2, n):
                        raise_inst = bs[j]
                        if raise_inst.opname.startswith('RAISE_VARARGS'):
                            if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD':
                                self.load_asserts.add(next_inst.offset)
                                pass
                            break
                    pass
                pass

        # Get jump targets
        # Format: {target offset: [jump offsets]}
        jump_targets = self.find_jump_targets(show_asm)

        for inst in bytecode:

            argval = inst.argval
            if inst.offset in jump_targets:
                jump_idx = 0
                # We want to process COME_FROMs to the same offset to be in *descending*
                # offset order so we have the larger range or biggest instruction interval
                # last. (I think they are sorted in increasing order, but for safety
                # we sort them). That way, specific COME_FROM tags will match up
                # properly. For example, a "loop" with an "if" nested in it should have the
                # "loop" tag last so the grammar rule matches that properly.
                for jump_offset in sorted(jump_targets[inst.offset], reverse=True):
                    come_from_name = 'COME_FROM'
                    opname = self.opName(jump_offset)
                    if opname.startswith('SETUP_'):
                        come_from_type = opname[len('SETUP_'):]
                        come_from_name = 'COME_FROM_%s' % come_from_type
                        pass
                    tokens.append(Token(come_from_name,
                                        None, repr(jump_offset),
                                        offset='%s_%s' % (inst.offset, jump_idx),
                                        has_arg = True, opc=self.opc))
                    jump_idx += 1
                    pass
                pass
            elif inst.offset in self.else_start:
                end_offset = self.else_start[inst.offset]
                tokens.append(Token('ELSE',
                                    None, repr(end_offset),
                                    offset='%s' % (inst.offset),
                                    has_arg = True, opc=self.opc))

                pass

            pattr =  inst.argrepr
            opname = inst.opname
            op = inst.opcode

            if opname in ['LOAD_CONST']:
                const = inst.argval
                if iscode(const):
                    if const.co_name == '<lambda>':
                        opname = 'LOAD_LAMBDA'
                    elif const.co_name == '<genexpr>':
                        opname = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        opname = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        opname = 'LOAD_SETCOMP'
                    elif const.co_name == '<listcomp>':
                        opname = 'LOAD_LISTCOMP'
                    # verify() uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' %\
                    # (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
                    pass
            elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
                pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval)
                if name_pair_args > 0:
                    opname = '%s_N%d' % (opname, name_pair_args)
                    pass
                if annotate_args > 0:
                    opname = '%s_A_%d' % (opname, annotate_args)
                    pass
                opname = '%s_%d' % (opname, pos_args)
                pattr = ("%d positional, %d keyword pair, %d annotated" %
                             (pos_args, name_pair_args, annotate_args))
                tokens.append(
                    Token(
                        type_ = opname,
                        attr = (pos_args, name_pair_args, annotate_args),
                        pattr = pattr,
                        offset = inst.offset,
                        linestart = inst.starts_line,
                        op = op,
                        has_arg = op_has_argument(op, op3),
                        opc = self.opc
                    )
                )
                continue
            elif op in self.varargs_ops:
                pos_args = inst.argval
                if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
                    opname = 'BUILD_MAP_n'
                else:
                    opname = '%s_%d' % (opname, pos_args)
            elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'):
                # The value in the dict is in special cases in semantic actions, such
                # as CALL_FUNCTION. The value is not used in these cases, so we put
                # in arbitrary value 0.
                customize[opname] = 0
            elif opname == 'UNPACK_EX':
                # FIXME: try with scanner and parser by
                # changing inst.argval
                before_args = inst.argval & 0xFF
                after_args = (inst.argval >> 8) & 0xff
                pattr = "%d before vararg, %d after" % (before_args, after_args)
                argval = (before_args, after_args)
                opname = '%s_%d+%d' % (opname, before_args, after_args)

            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classify JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead. In
                # comprehensions we might sometimes classify JUMP_BACK
                # as CONTINUE, but that's okay since we add a grammar
                # rule for that.
                pattr = inst.argval
                target = self.get_target(inst.offset)
                if target <= inst.offset:
                    next_opname = self.opname[self.code[inst.offset+3]]
                    if (inst.offset in self.stmts and
                        next_opname not in ('END_FINALLY', 'POP_BLOCK',
                                            # Python 3.0 only uses POP_TOP
                                            'POP_TOP')
                        and inst.offset not in self.not_continue):
                        opname = 'CONTINUE'
                    else:
                        opname = 'JUMP_BACK'
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        # There are other situations where we don't catch
                        # CONTINUE as well.
                        if tokens[-1].type == 'JUMP_BACK' and tokens[-1].attr <= argval:
                            # intern is used because we are changing the *previous* token
                            tokens[-1].type = intern('CONTINUE')

            elif op == self.opc.RETURN_VALUE:
                if inst.offset in self.return_end_ifs:
                    opname = 'RETURN_END_IF'
            elif inst.offset in self.load_asserts:
                opname = 'LOAD_ASSERT'

            tokens.append(
                Token(
                    type_ = opname,
                    attr = argval,
                    pattr = pattr,
                    offset = inst.offset,
                    linestart = inst.starts_line,
                    op = op,
                    has_arg = (op >= op3.HAVE_ARGUMENT),
                    opc = self.opc
                    )
                )
            pass

        if show_asm in ('both', 'after'):
            for t in tokens:
                print(t)
            print()
        return tokens, customize
Esempio n. 11
0
    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
        returning a list of uncompyle6 'Token's.

        The transformations are made to assist the deparsing grammar.
        Specificially:
           -  various types of LOAD_CONST's are categorized in terms of what they load
           -  COME_FROM instructions are added to assist parsing control structures
           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

        Also, when we encounter certain tokens, we add them to a set which will cause custom
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """

        show_asm = self.show_asm if not show_asm else show_asm
        # show_asm = 'after'
        if show_asm in ('both', 'before'):
            from xdis.bytecode import Bytecode
            bytecode = Bytecode(co, self.opc)
            for instr in bytecode.get_instructions(co):
                print(instr._disassemble())

        # Container for tokens
        tokens = []

        customize = {}
        if self.is_pypy:
            customize['PyPy'] = 1

        Token = self.Token  # shortcut

        codelen = self.setup_code(co)

        self.build_lines_data(co, codelen)
        self.build_prev_op(codelen)

        free, names, varnames = self.unmangle_code_names(co, classname)
        self.names = names

        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        for i in self.op_range(0, codelen):
            # We need to detect the difference between:
            #   raise AssertionError
            #  and
            #   assert ...
            if (self.code[i] == self.opc.JUMP_IF_TRUE and i + 4 < codelen
                    and self.code[i + 3] == self.opc.POP_TOP
                    and self.code[i + 4] == self.opc.LOAD_GLOBAL):
                if names[self.get_argument(i + 4)] == 'AssertionError':
                    self.load_asserts.add(i + 4)

        jump_targets = self.find_jump_targets(show_asm)
        # contains (code, [addrRefToCode])

        last_stmt = self.next_stmt[0]
        i = self.next_stmt[last_stmt]
        replace = {}
        while i < codelen - 1:
            if self.lines[last_stmt].next > i:
                # Distinguish "print ..." from "print ...,"
                if self.code[last_stmt] == self.opc.PRINT_ITEM:
                    if self.code[i] == self.opc.PRINT_ITEM:
                        replace[i] = 'PRINT_ITEM_CONT'
                    elif self.code[i] == self.opc.PRINT_NEWLINE:
                        replace[i] = 'PRINT_NEWLINE_CONT'
            last_stmt = i
            i = self.next_stmt[i]

        extended_arg = 0
        for offset in self.op_range(0, codelen):
            op = self.code[offset]
            op_name = self.opname[op]
            oparg = None
            pattr = None

            if offset in jump_targets:
                jump_idx = 0
                # We want to process COME_FROMs to the same offset to be in *descending*
                # offset order so we have the larger range or biggest instruction interval
                # last. (I think they are sorted in increasing order, but for safety
                # we sort them). That way, specific COME_FROM tags will match up
                # properly. For example, a "loop" with an "if" nested in it should have the
                # "loop" tag last so the grammar rule matches that properly.
                last_jump_offset = -1
                for jump_offset in sorted(jump_targets[offset], reverse=True):
                    if jump_offset != last_jump_offset:
                        tokens.append(
                            Token('COME_FROM',
                                  None,
                                  repr(jump_offset),
                                  offset="%s_%d" % (offset, jump_idx),
                                  has_arg=True))
                        jump_idx += 1
                        last_jump_offset = jump_offset
            elif offset in self.thens:
                tokens.append(
                    Token('THEN',
                          None,
                          self.thens[offset],
                          offset="%s_0" % offset,
                          has_arg=True))

            has_arg = (op >= self.opc.HAVE_ARGUMENT)
            if has_arg:
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
                    extended_arg = oparg * L65536
                    continue
                if op in self.opc.CONST_OPS:
                    const = co.co_consts[oparg]
                    # We can't use inspect.iscode() because we may be
                    # using a different version of Python than the
                    # one that this was byte-compiled on. So the code
                    # types may mismatch.
                    if hasattr(const, 'co_name'):
                        oparg = const
                        if const.co_name == '<lambda>':
                            assert op_name == 'LOAD_CONST'
                            op_name = 'LOAD_LAMBDA'
                        elif const.co_name == self.genexpr_name:
                            op_name = 'LOAD_GENEXPR'
                        elif const.co_name == '<dictcomp>':
                            op_name = 'LOAD_DICTCOMP'
                        elif const.co_name == '<setcomp>':
                            op_name = 'LOAD_SETCOMP'
                        # verify uses 'pattr' for comparison, since 'attr'
                        # now holds Code(const) and thus can not be used
                        # for comparison (todo: think about changing this)
                        # pattr = 'code_object @ 0x%x %s->%s' % \
                        # (id(const), const.co_filename, const.co_name)
                        pattr = '<code_object ' + const.co_name + '>'
                    else:
                        pattr = const
                elif op in self.opc.NAME_OPS:
                    pattr = names[oparg]
                elif op in self.opc.JREL_OPS:
                    pattr = repr(offset + 3 + oparg)
                    if op == self.opc.JUMP_FORWARD:
                        target = self.get_target(offset)
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        if len(tokens) and tokens[-1].type == 'JUMP_BACK':
                            tokens[-1].type = intern('CONTINUE')

                elif op in self.opc.JABS_OPS:
                    pattr = repr(oparg)
                elif op in self.opc.LOCAL_OPS:
                    pattr = varnames[oparg]
                elif op in self.opc.COMPARE_OPS:
                    pattr = self.opc.cmp_op[oparg]
                elif op in self.opc.FREE_OPS:
                    pattr = free[oparg]
            if op in self.varargs_ops:
                # CE - Hack for >= 2.5
                #      Now all values loaded via LOAD_CLOSURE are packed into
                #      a tuple before calling MAKE_CLOSURE.
                if (self.version >= 2.5 and op == self.opc.BUILD_TUPLE and
                        self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE):
                    continue
                else:
                    op_name = '%s_%d' % (op_name, oparg)
                    if op != self.opc.BUILD_SLICE:
                        customize[op_name] = oparg
            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classify JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead.  In
                # comprehensions we might sometimes classify JUMP_BACK
                # as CONTINUE, but that's okay since we add a grammar
                # rule for that.
                target = self.get_target(offset)
                if target <= offset:
                    op_name = 'JUMP_BACK'
                    if (offset in self.stmts and self.code[offset + 3]
                            not in (self.opc.END_FINALLY, self.opc.POP_BLOCK)):
                        if ((offset in self.linestartoffsets
                             and tokens[-1].type == 'JUMP_BACK')
                                or offset not in self.not_continue):
                            op_name = 'CONTINUE'
                    else:
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        if tokens[-1].type == 'JUMP_BACK':
                            # We need 'intern' since we have
                            # already have processed the previous
                            # token.
                            tokens[-1].type = intern('CONTINUE')

            elif op == self.opc.LOAD_GLOBAL:
                if offset in self.load_asserts:
                    op_name = 'LOAD_ASSERT'
            elif op == self.opc.RETURN_VALUE:
                if offset in self.return_end_ifs:
                    op_name = 'RETURN_END_IF'

            if offset in self.linestartoffsets:
                linestart = self.linestartoffsets[offset]
            else:
                linestart = None

            if offset not in replace:
                tokens.append(
                    Token(op_name, oparg, pattr, offset, linestart, op,
                          has_arg, self.opc))
            else:
                tokens.append(
                    Token(replace[offset], oparg, pattr, offset, linestart, op,
                          has_arg, self.opc))
                pass
            pass

        if show_asm in ('both', 'after'):
            for t in tokens:
                print(t.format(line_prefix='L.'))
            print()
        return tokens, customize
Esempio n. 12
0
    def disassemble(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
        returning a list of uncompyle6 'Token's.

        The tranformations are made to assist the deparsing grammar.
        Specificially:
           -  various types of LOAD_CONST's are categorized in terms of what they load
           -  COME_FROM instructions are added to assist parsing control structures
           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

        Also, when we encounter certain tokens, we add them to a set which will cause custom
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """

        show_asm = self.show_asm if not show_asm else show_asm
        # show_asm = 'before'
        if show_asm in ('both', 'before'):
            from xdis.bytecode import Bytecode
            bytecode = Bytecode(co, self.opc)
            for instr in bytecode.get_instructions(co):
                print(instr._disassemble())

        # Container for tokens
        tokens = []

        customize = {}
        if self.is_pypy:
            customize['PyPy'] = 1;

        Token = self.Token # shortcut

        n = self.setup_code(co)

        self.build_lines_data(co, n)
        self.build_prev_op(n)

        # self.lines contains (block,addrLastInstr)
        if classname:
            classname = '_' + classname.lstrip('_') + '__'

            def unmangle(name):
                if name.startswith(classname) and name[-2:] != '__':
                    return name[len(classname) - 2:]
                return name

            free = [ unmangle(name) for name in (co.co_cellvars + co.co_freevars) ]
            names = [ unmangle(name) for name in co.co_names ]
            varnames = [ unmangle(name) for name in co.co_varnames ]
        else:
            free = co.co_cellvars + co.co_freevars
            names = co.co_names
            varnames = co.co_varnames
        self.names = names

        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        for i in self.op_range(0, n):
            # We need to detect the difference between:
            #   raise AssertionError
            #  and
            #   assert ...
            # Below we use the heuristic that it is preceded by a POP_JUMP.
            # however we could also use followed by RAISE_VARARGS
            # or for PyPy there may be a JUMP_IF_NOT_DEBUG before.
            # FIXME: remove uses of PJIF, and PJIT
            if self.is_pypy:
                have_pop_jump = self.code[i] in (self.opc.PJIF,
                                                 self.opc.PJIT)
            else:
                have_pop_jump = self.code[i] == self.opc.PJIT

            if have_pop_jump and self.code[i+3] == self.opc.LOAD_GLOBAL:
                if names[self.get_argument(i+3)] == 'AssertionError':
                    self.load_asserts.add(i+3)

        cf = self.find_jump_targets()
        # contains (code, [addrRefToCode])
        last_stmt = self.next_stmt[0]
        i = self.next_stmt[last_stmt]
        replace = {}
        while i < n-1:
            if self.lines[last_stmt].next > i:
                if self.code[last_stmt] == self.opc.PRINT_ITEM:
                    if self.code[i] == self.opc.PRINT_ITEM:
                        replace[i] = 'PRINT_ITEM_CONT'
                    elif self.code[i] == self.opc.PRINT_NEWLINE:
                        replace[i] = 'PRINT_NEWLINE_CONT'
            last_stmt = i
            i = self.next_stmt[i]

        extended_arg = 0
        for offset in self.op_range(0, n):
            if offset in cf:
                k = 0
                for j in cf[offset]:
                    tokens.append(Token(
                        'COME_FROM', None, repr(j),
                        offset="%s_%d" % (offset, k),
                        has_arg = True))
                    k += 1

            op = self.code[offset]
            opname = self.opc.opname[op]

            oparg = None; pattr = None
            has_arg = (op >= self.opc.HAVE_ARGUMENT)
            if has_arg:
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
                    extended_arg = oparg * scan.L65536
                    continue
                if op in self.opc.hasconst:
                    const = co.co_consts[oparg]
                    if iscode(const):
                        oparg = const
                        if const.co_name == '<lambda>':
                            assert opname == 'LOAD_CONST'
                            opname = 'LOAD_LAMBDA'
                        elif const.co_name == '<genexpr>':
                            opname = 'LOAD_GENEXPR'
                        elif const.co_name == '<dictcomp>':
                            opname = 'LOAD_DICTCOMP'
                        elif const.co_name == '<setcomp>':
                            opname = 'LOAD_SETCOMP'
                        # verify() uses 'pattr' for comparison, since 'attr'
                        # now holds Code(const) and thus can not be used
                        # for comparison (todo: think about changing this)
                        # pattr = 'code_object @ 0x%x %s->%s' %\
                        # (id(const), const.co_filename, const.co_name)
                        pattr = '<code_object ' + const.co_name + '>'
                    else:
                        pattr = const
                elif op in self.opc.hasname:
                    pattr = names[oparg]
                elif op in self.opc.hasjrel:
                    #  use instead: hasattr(self, 'patch_continue'): ?
                    if self.version == 2.7:
                        self.patch_continue(tokens, offset, op)
                    pattr = repr(offset + 3 + oparg)
                elif op in self.opc.hasjabs:
                    # use instead: hasattr(self, 'patch_continue'): ?
                    if self.version == 2.7:
                        self.patch_continue(tokens, offset, op)
                    pattr = repr(oparg)
                elif op in self.opc.haslocal:
                    pattr = varnames[oparg]
                elif op in self.opc.hascompare:
                    pattr = self.opc.cmp_op[oparg]
                elif op in self.opc.hasfree:
                    pattr = free[oparg]

            if op in self.varargs_ops:
                # CE - Hack for >= 2.5
                #      Now all values loaded via LOAD_CLOSURE are packed into
                #      a tuple before calling MAKE_CLOSURE.
                if op == self.opc.BUILD_TUPLE and \
                    self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE:
                    continue
                else:
                    if self.is_pypy and not oparg and opname == 'BUILD_MAP':
                        opname = 'BUILD_MAP_n'
                    else:
                        opname = '%s_%d' % (opname, oparg)
                    if op != self.opc.BUILD_SLICE:
                        customize[opname] = oparg
            elif self.is_pypy and opname in ('LOOKUP_METHOD',
                                             'JUMP_IF_NOT_DEBUG',
                                             'SETUP_EXCEPT',
                                             'SETUP_FINALLY'):
                # The value in the dict is in special cases in semantic actions, such
                # as CALL_FUNCTION. The value is not used in these cases, so we put
                # in arbitrary value 0.
                customize[opname] = 0
            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classify JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead. In
                # comprehensions we might sometimes classify JUMP_BACK
                # as CONTINUE, but that's okay since we add a grammar
                # rule for that.
                target = self.get_target(offset)
                if target <= offset:
                    if (offset in self.stmts
                        and self.code[offset+3] not in (self.opc.END_FINALLY,
                                                        self.opc.POP_BLOCK)
                        and offset not in self.not_continue):
                        opname = 'CONTINUE'
                    else:
                        opname = 'JUMP_BACK'

            elif op == self.opc.LOAD_GLOBAL:
                if offset in self.load_asserts:
                    opname = 'LOAD_ASSERT'
            elif op == self.opc.RETURN_VALUE:
                if offset in self.return_end_ifs:
                    opname = 'RETURN_END_IF'

            if offset in self.linestartoffsets:
                linestart = self.linestartoffsets[offset]
            else:
                linestart = None

            if offset not in replace:
                tokens.append(Token(
                    opname, oparg, pattr, offset, linestart, op,
                    has_arg, self.opc))
            else:
                tokens.append(Token(
                    replace[offset], oparg, pattr, offset, linestart,
                    op, has_arg, self.opc))
                pass
            pass

        if show_asm in ('both', 'after'):
            for t in tokens:
                print(t)
            print()
        return tokens, customize
Esempio n. 13
0
    def disassemble(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
        returning a list of uncompyle6 'Token's.

        The tranformations are made to assist the deparsing grammar.
        Specificially:
           -  various types of LOAD_CONST's are categorized in terms of what they load
           -  COME_FROM instructions are added to assist parsing control structures
           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

        Also, when we encounter certain tokens, we add them to a set which will cause custom
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """

        show_asm = self.show_asm if not show_asm else show_asm
        # show_asm = 'both'
        if show_asm in ('both', 'before'):
            bytecode = Bytecode(co, self.opc)
            for instr in bytecode.get_instructions(co):
                print(instr._disassemble())

        # Container for tokens
        tokens = []

        customize = {}
        if self.is_pypy:
            customize['PyPy'] = 1;

        self.code = array('B', co.co_code)
        self.build_lines_data(co)
        self.build_prev_op()

        bytecode = Bytecode(co, self.opc)

        # FIXME: put as its own method?
        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        bs = list(bytecode)
        n = len(bs)
        for i in range(n):
            inst = bs[i]

            # We need to detect the difference between
            # "raise AssertionError" and "assert"
            # If we have a JUMP_FORWARD after the
            # RAISE_VARARGS then we have a "raise" statement
            # else we have an "assert" statement.
            if inst.opname == 'POP_JUMP_IF_TRUE' and i+1 < n:
                next_inst = bs[i+1]
                if (next_inst.opname == 'LOAD_GLOBAL' and
                    next_inst.argval == 'AssertionError'):
                    for j in range(i+2, n):
                        raise_inst = bs[j]
                        if raise_inst.opname.startswith('RAISE_VARARGS'):
                            if j+1 >= n or bs[j+1].opname != 'JUMP_FORWARD':
                                self.load_asserts.add(next_inst.offset)
                                pass
                            break
                    pass
                pass

        # Get jump targets
        # Format: {target offset: [jump offsets]}
        jump_targets = self.find_jump_targets()

        for inst in bytecode:

            argval = inst.argval
            if inst.offset in jump_targets:
                jump_idx = 0
                for jump_offset in jump_targets[inst.offset]:
                    tokens.append(Token('COME_FROM', None, repr(jump_offset),
                                        offset='%s_%s' % (inst.offset, jump_idx),
                                        has_arg = True, opc=self.opc))
                    jump_idx += 1
                    pass
                pass

            pattr =  inst.argrepr
            opname = inst.opname
            op = inst.opcode

            if opname in ['LOAD_CONST']:
                const = inst.argval
                if iscode(const):
                    if const.co_name == '<lambda>':
                        opname = 'LOAD_LAMBDA'
                    elif const.co_name == '<genexpr>':
                        opname = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        opname = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        opname = 'LOAD_SETCOMP'
                    elif const.co_name == '<listcomp>':
                        opname = 'LOAD_LISTCOMP'
                    # verify() uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' %\
                    # (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
                    pass
            elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
                pos_args, name_pair_args, annotate_args = parse_fn_counts(inst.argval)
                if name_pair_args > 0:
                    opname = '%s_N%d' % (opname, name_pair_args)
                    pass
                if annotate_args > 0:
                    opname = '%s_A_%d' % [opname, annotate_args]
                    pass
                opname = '%s_%d' % (opname, pos_args)
                pattr = ("%d positional, %d keyword pair, %d annotated" %
                             (pos_args, name_pair_args, annotate_args))
                tokens.append(
                    Token(
                        type_ = opname,
                        attr = (pos_args, name_pair_args, annotate_args),
                        pattr = pattr,
                        offset = inst.offset,
                        linestart = inst.starts_line,
                        op = op,
                        has_arg = op_has_argument(op, op3),
                        opc = self.opc
                    )
                )
                continue
            elif op in self.varargs_ops:
                pos_args = inst.argval
                if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
                    opname = 'BUILD_MAP_n'
                else:
                    opname = '%s_%d' % (opname, pos_args)
            elif self.is_pypy and opname in ('CALL_METHOD', 'JUMP_IF_NOT_DEBUG'):
                # The value in the dict is in special cases in semantic actions, such
                # as CALL_FUNCTION. The value is not used in these cases, so we put
                # in arbitrary value 0.
                customize[opname] = 0
            elif opname == 'UNPACK_EX':
                # FIXME: try with scanner and parser by
                # changing inst.argval
                before_args = inst.argval & 0xFF
                after_args = (inst.argval >> 8) & 0xff
                pattr = "%d before vararg, %d after" % (before_args, after_args)
                argval = (before_args, after_args)
                opname = '%s_%d+%d' % (opname, before_args, after_args)
            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classify JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead. In
                # comprehensions we might sometimes classify JUMP_BACK
                # as CONTINUE, but that's okay since we add a grammar
                # rule for that.
                pattr = inst.argval
                target = self.get_target(inst.offset)
                if target <= inst.offset:
                    next_opname = self.opname[self.code[inst.offset+3]]
                    if (inst.offset in self.stmts and
                        next_opname not in ('END_FINALLY', 'POP_BLOCK')
                        and inst.offset not in self.not_continue):
                        opname = 'CONTINUE'
                    else:
                        opname = 'JUMP_BACK'
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        # There are other situations were we don't catch
                        # CONTINUE as well.
                        if tokens[-1].type == 'JUMP_BACK':
                            tokens[-1].type = intern('CONTINUE')

            elif op == self.opc.RETURN_VALUE:
                if inst.offset in self.return_end_ifs:
                    opname = 'RETURN_END_IF'
            elif inst.offset in self.load_asserts:
                opname = 'LOAD_ASSERT'

            tokens.append(
                Token(
                    type_ = opname,
                    attr = argval,
                    pattr = pattr,
                    offset = inst.offset,
                    linestart = inst.starts_line,
                    op = op,
                    has_arg = (op >= op3.HAVE_ARGUMENT),
                    opc = self.opc
                    )
                )
            pass

        if show_asm in ('both', 'after'):
            for t in tokens:
                print(t)
            print()
        return tokens, customize
Esempio n. 14
0
    def disassemble(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Disassemble a Python 2 code object, returning a list of 'Token'.
        Various tranformations are made to assist the deparsing grammar.
        For example:
           -  various types of LOAD_CONST's are categorized in terms of what they load
           -  COME_FROM instructions are added to assist parsing control structures
           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional aruments
        The main part of this procedure is modelled after
        dis.disassemble().
        """

        show_asm = self.show_asm if not show_asm else show_asm
        # show_asm = 'before'
        if show_asm in ('both', 'before'):
            from xdis.bytecode import Bytecode
            bytecode = Bytecode(co, self.opc)
            for instr in bytecode.get_instructions(co):
                print(instr._disassemble())

        # Container for tokens
        tokens = []

        customize = {}
        Token = self.Token # shortcut

        n = self.setup_code(co)

        self.build_lines_data(co, n)
        self.build_prev_op(n)

        # self.lines contains (block,addrLastInstr)
        if classname:
            classname = '_' + classname.lstrip('_') + '__'

            def unmangle(name):
                if name.startswith(classname) and name[-2:] != '__':
                    return name[len(classname) - 2:]
                return name

            free = [ unmangle(name) for name in (co.co_cellvars + co.co_freevars) ]
            names = [ unmangle(name) for name in co.co_names ]
            varnames = [ unmangle(name) for name in co.co_varnames ]
        else:
            free = co.co_cellvars + co.co_freevars
            names = co.co_names
            varnames = co.co_varnames
        self.names = names

        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        for i in self.op_range(0, n):
            # We need to detect the difference between
            # "raise AssertionError" and
            # "assert"
            if self.code[i] == self.opc.PJIT and self.code[i+3] == self.opc.LOAD_GLOBAL:
                if names[self.get_argument(i+3)] == 'AssertionError':
                    self.load_asserts.add(i+3)

        cf = self.find_jump_targets()
        # contains (code, [addrRefToCode])
        last_stmt = self.next_stmt[0]
        i = self.next_stmt[last_stmt]
        replace = {}
        while i < n-1:
            if self.lines[last_stmt].next > i:
                if self.code[last_stmt] == self.opc.PRINT_ITEM:
                    if self.code[i] == self.opc.PRINT_ITEM:
                        replace[i] = 'PRINT_ITEM_CONT'
                    elif self.code[i] == self.opc.PRINT_NEWLINE:
                        replace[i] = 'PRINT_NEWLINE_CONT'
            last_stmt = i
            i = self.next_stmt[i]

        extended_arg = 0
        for offset in self.op_range(0, n):
            if offset in cf:
                k = 0
                for j in cf[offset]:
                    tokens.append(Token(
                        'COME_FROM', None, repr(j),
                        offset="%s_%d" % (offset, k),
                        has_arg = True))
                    k += 1

            op = self.code[offset]
            opname = self.opc.opname[op]

            oparg = None; pattr = None
            has_arg = (op >= self.opc.HAVE_ARGUMENT)
            if has_arg:
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
                    extended_arg = oparg * scan.L65536
                    continue
                if op in self.opc.hasconst:
                    const = co.co_consts[oparg]
                    if iscode(const):
                        oparg = const
                        if const.co_name == '<lambda>':
                            assert opname == 'LOAD_CONST'
                            opname = 'LOAD_LAMBDA'
                        elif const.co_name == '<genexpr>':
                            opname = 'LOAD_GENEXPR'
                        elif const.co_name == '<dictcomp>':
                            opname = 'LOAD_DICTCOMP'
                        elif const.co_name == '<setcomp>':
                            opname = 'LOAD_SETCOMP'
                        # verify() uses 'pattr' for comparison, since 'attr'
                        # now holds Code(const) and thus can not be used
                        # for comparison (todo: think about changing this)
                        # pattr = 'code_object @ 0x%x %s->%s' %\
                        # (id(const), const.co_filename, const.co_name)
                        pattr = '<code_object ' + const.co_name + '>'
                    else:
                        pattr = const
                elif op in self.opc.hasname:
                    pattr = names[oparg]
                elif op in self.opc.hasjrel:
                    pattr = repr(offset + 3 + oparg)
                elif op in self.opc.hasjabs:
                    pattr = repr(oparg)
                elif op in self.opc.haslocal:
                    pattr = varnames[oparg]
                elif op in self.opc.hascompare:
                    pattr = self.opc.cmp_op[oparg]
                elif op in self.opc.hasfree:
                    pattr = free[oparg]

            if op in self.varargs_ops:
                # CE - Hack for >= 2.5
                #      Now all values loaded via LOAD_CLOSURE are packed into
                #      a tuple before calling MAKE_CLOSURE.
                if op == self.opc.BUILD_TUPLE and \
                    self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE:
                    continue
                else:
                    opname = '%s_%d' % (opname, oparg)
                    if op != self.opc.BUILD_SLICE:
                        customize[opname] = oparg
            elif op == self.opc.JUMP_ABSOLUTE:
                target = self.get_target(offset)
                if target < offset:
                    if (offset in self.stmts
                        and self.code[offset+3] not in (self.opc.END_FINALLY,
                                                        self.opc.POP_BLOCK)
                        and offset not in self.not_continue):
                        opname = 'CONTINUE'
                    else:
                        opname = 'JUMP_BACK'

            elif op == self.opc.LOAD_GLOBAL:
                if offset in self.load_asserts:
                    opname = 'LOAD_ASSERT'
            elif op == self.opc.RETURN_VALUE:
                if offset in self.return_end_ifs:
                    opname = 'RETURN_END_IF'

            if offset in self.linestartoffsets:
                linestart = self.linestartoffsets[offset]
            else:
                linestart = None

            if offset not in replace:
                tokens.append(Token(
                    opname, oparg, pattr, offset, linestart, op, has_arg))
            else:
                tokens.append(Token(
                    replace[offset], oparg, pattr, offset, linestart, op, has_arg))
                pass
            pass

        if show_asm in ('both', 'after'):
            for t in tokens:
                print(t.format())
            print()
        return tokens, customize
Esempio n. 15
0
    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
        returning a list of uncompyle6 'Token's.

        The transformations are made to assist the deparsing grammar.
        Specificially:
           -  various types of LOAD_CONST's are categorized in terms of what they load
           -  COME_FROM instructions are added to assist parsing control structures
           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments

        Also, when we encounter certain tokens, we add them to a set which will cause custom
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """

        show_asm = self.show_asm if not show_asm else show_asm
        # show_asm = 'after'
        if show_asm in ('both', 'before'):
            from xdis.bytecode import Bytecode
            bytecode = Bytecode(co, self.opc)
            for instr in bytecode.get_instructions(co):
                print(instr._disassemble())

        # Container for tokens
        tokens = []

        customize = {}
        if self.is_pypy:
            customize['PyPy'] = 1

        Token = self.Token # shortcut

        codelen = self.setup_code(co)

        self.build_lines_data(co, codelen)
        self.build_prev_op(codelen)

        free, names, varnames = self.unmangle_code_names(co, classname)
        self.names = names

        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        for i in self.op_range(0, codelen):
            # We need to detect the difference between:
            #   raise AssertionError
            #  and
            #   assert ...
            if (self.code[i] == self.opc.JUMP_IF_TRUE and
                i + 4 < codelen and
                self.code[i+3] == self.opc.POP_TOP and
                self.code[i+4] == self.opc.LOAD_GLOBAL):
                if names[self.get_argument(i+4)] == 'AssertionError':
                    self.load_asserts.add(i+4)

        jump_targets = self.find_jump_targets(show_asm)
        # contains (code, [addrRefToCode])

        last_stmt = self.next_stmt[0]
        i = self.next_stmt[last_stmt]
        replace = {}
        while i < codelen - 1:
            if self.lines[last_stmt].next > i:
                # Distinguish "print ..." from "print ...,"
                if self.code[last_stmt] == self.opc.PRINT_ITEM:
                    if self.code[i] == self.opc.PRINT_ITEM:
                        replace[i] = 'PRINT_ITEM_CONT'
                    elif self.code[i] == self.opc.PRINT_NEWLINE:
                        replace[i] = 'PRINT_NEWLINE_CONT'
            last_stmt = i
            i = self.next_stmt[i]

        extended_arg = 0
        for offset in self.op_range(0, codelen):
            op = self.code[offset]
            op_name = self.opname[op]
            oparg = None; pattr = None

            if offset in jump_targets:
                jump_idx = 0
                # We want to process COME_FROMs to the same offset to be in *descending*
                # offset order so we have the larger range or biggest instruction interval
                # last. (I think they are sorted in increasing order, but for safety
                # we sort them). That way, specific COME_FROM tags will match up
                # properly. For example, a "loop" with an "if" nested in it should have the
                # "loop" tag last so the grammar rule matches that properly.
                last_jump_offset = -1
                for jump_offset  in sorted(jump_targets[offset], reverse=True):
                    if jump_offset != last_jump_offset:
                        tokens.append(Token(
                            'COME_FROM', None, repr(jump_offset),
                            offset="%s_%d" % (offset, jump_idx),
                            has_arg = True))
                        jump_idx += 1
                        last_jump_offset = jump_offset
            elif offset in self.thens:
                tokens.append(Token(
                    'THEN', None, self.thens[offset],
                    offset="%s_0" % offset,
                    has_arg = True))

            has_arg = (op >= self.opc.HAVE_ARGUMENT)
            if has_arg:
                oparg = self.get_argument(offset) + extended_arg
                extended_arg = 0
                if op == self.opc.EXTENDED_ARG:
                    raise NotImplementedError
                    extended_arg = oparg * scan.L65536
                    continue
                if op in self.opc.hasconst:
                    const = co.co_consts[oparg]
                    # We can't use inspect.iscode() because we may be
                    # using a different version of Python than the
                    # one that this was byte-compiled on. So the code
                    # types may mismatch.
                    if hasattr(const, 'co_name'):
                        oparg = const
                        if const.co_name == '<lambda>':
                            assert op_name == 'LOAD_CONST'
                            op_name = 'LOAD_LAMBDA'
                        elif const.co_name == self.genexpr_name:
                            op_name = 'LOAD_GENEXPR'
                        elif const.co_name == '<dictcomp>':
                            op_name = 'LOAD_DICTCOMP'
                        elif const.co_name == '<setcomp>':
                            op_name = 'LOAD_SETCOMP'
                        # verify uses 'pattr' for comparison, since 'attr'
                        # now holds Code(const) and thus can not be used
                        # for comparison (todo: think about changing this)
                        # pattr = 'code_object @ 0x%x %s->%s' % \
                        # (id(const), const.co_filename, const.co_name)
                        pattr = '<code_object ' + const.co_name + '>'
                    else:
                        pattr = const
                elif op in self.opc.hasname:
                    pattr = names[oparg]
                elif op in self.opc.hasjrel:
                    pattr = repr(offset + 3 + oparg)
                    if op == self.opc.JUMP_FORWARD:
                        target = self.get_target(offset)
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        if len(tokens) and tokens[-1].type == 'JUMP_BACK':
                            tokens[-1].type = intern('CONTINUE')

                elif op in self.opc.hasjabs:
                    pattr = repr(oparg)
                elif op in self.opc.haslocal:
                    pattr = varnames[oparg]
                elif op in self.opc.hascompare:
                    pattr = self.opc.cmp_op[oparg]
                elif op in self.opc.hasfree:
                    pattr = free[oparg]
            if op in self.varargs_ops:
                # CE - Hack for >= 2.5
                #      Now all values loaded via LOAD_CLOSURE are packed into
                #      a tuple before calling MAKE_CLOSURE.
                if (self.version >= 2.5 and op == self.opc.BUILD_TUPLE and
                    self.code[self.prev[offset]] == self.opc.LOAD_CLOSURE):
                    continue
                else:
                    op_name = '%s_%d' % (op_name, oparg)
                    if op != self.opc.BUILD_SLICE:
                        customize[op_name] = oparg
            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classify JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead.  In
                # comprehensions we might sometimes classify JUMP_BACK
                # as CONTINUE, but that's okay since we add a grammar
                # rule for that.
                target = self.get_target(offset)
                if target <= offset:
                    op_name = 'JUMP_BACK'
                    if (offset in self.stmts
                        and self.code[offset+3] not in (self.opc.END_FINALLY,
                                                          self.opc.POP_BLOCK)):
                        if ((offset in self.linestartoffsets and
                            tokens[-1].type == 'JUMP_BACK')
                            or offset not in self.not_continue):
                            op_name = 'CONTINUE'
                    else:
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        if tokens[-1].type == 'JUMP_BACK':
                            # We need 'intern' since we have
                            # already have processed the previous
                            # token.
                            tokens[-1].type = intern('CONTINUE')

            elif op == self.opc.LOAD_GLOBAL:
                if offset in self.load_asserts:
                    op_name = 'LOAD_ASSERT'
            elif op == self.opc.RETURN_VALUE:
                if offset in self.return_end_ifs:
                    op_name = 'RETURN_END_IF'

            if offset in self.linestartoffsets:
                linestart = self.linestartoffsets[offset]
            else:
                linestart = None

            if offset not in replace:
                tokens.append(Token(
                    op_name, oparg, pattr, offset, linestart, op,
                    has_arg, self.opc))
            else:
                tokens.append(Token(
                    replace[offset], oparg, pattr, offset, linestart, op,
                    has_arg, self.opc))
                pass
            pass

        if show_asm in ('both', 'after'):
            for t in tokens:
                print(t.format(line_prefix='L.'))
            print()
        return tokens, customize
Esempio n. 16
0
    def ingest(self, co, classname=None, code_objects={}, show_asm=None):
        """
        Pick out tokens from an uncompyle6 code object, and transform them,
        returning a list of uncompyle6 Token's.

        The transformations are made to assist the deparsing grammar.
        Specificially:
           -  various types of LOAD_CONST's are categorized in terms of what they load
           -  COME_FROM instructions are added to assist parsing control structures
           -  MAKE_FUNCTION and FUNCTION_CALLS append the number of positional arguments
           -  some EXTENDED_ARGS instructions are removed

        Also, when we encounter certain tokens, we add them to a set which will cause custom
        grammar rules. Specifically, variable arg tokens like MAKE_FUNCTION or BUILD_LIST
        cause specific rules for the specific number of arguments they take.
        """

        # FIXME: remove this when all subsidiary functions have been removed.
        # We should be able to get everything from the self.insts list.
        self.code = array('B', co.co_code)

        bytecode = Bytecode(co, self.opc)
        show_asm = self.show_asm if not show_asm else show_asm
        # show_asm = 'both'
        if show_asm in ('both', 'before'):
            for instr in bytecode.get_instructions(co):
                print(instr.disassemble())

        # list of tokens/instructions
        tokens = []

        # "customize" is a dict whose keys are nonterminals
        # and the value is the argument stack entries for that
        # nonterminal. The count is a little hoaky. It is mostly
        # not used, but sometimes it is.
        # "customize" is a dict whose keys are nonterminals
        customize = {}

        if self.is_pypy:
            customize['PyPy'] = 0

        self.build_lines_data(co)
        self.build_prev_op()

        # FIXME: put as its own method?
        # Scan for assertions. Later we will
        # turn 'LOAD_GLOBAL' to 'LOAD_ASSERT'.
        # 'LOAD_ASSERT' is used in assert statements.
        self.load_asserts = set()
        self.insts = list(bytecode)
        n = len(self.insts)
        for i, inst in enumerate(self.insts):
            # We need to detect the difference between:
            #   raise AssertionError
            #  and
            #   assert ...
            # If we have a JUMP_FORWARD after the
            # RAISE_VARARGS then we have a "raise" statement
            # else we have an "assert" statement.
            if inst.opname == 'POP_JUMP_IF_TRUE' and i + 1 < n:
                next_inst = self.insts[i + 1]
                if (next_inst.opname == 'LOAD_GLOBAL'
                        and next_inst.argval == 'AssertionError'):
                    if (i + 2 < n and
                            self.insts[i +
                                       2].opname.startswith('RAISE_VARARGS')):
                        self.load_asserts.add(next_inst.offset)
                    pass
                pass

        # Get jump targets
        # Format: {target offset: [jump offsets]}
        jump_targets = self.find_jump_targets(show_asm)
        # print("XXX2", jump_targets)

        last_op_was_break = False

        for i, inst in enumerate(bytecode):

            argval = inst.argval
            op = inst.opcode
            if op == self.opc.EXTENDED_ARG:
                # FIXME: The EXTENDED_ARG is used to signal annotation
                # parameters
                if self.insts[i + 1].opcode != self.opc.MAKE_FUNCTION:
                    continue

            if inst.offset in jump_targets:
                jump_idx = 0
                # We want to process COME_FROMs to the same offset to be in *descending*
                # offset order so we have the larger range or biggest instruction interval
                # last. (I think they are sorted in increasing order, but for safety
                # we sort them). That way, specific COME_FROM tags will match up
                # properly. For example, a "loop" with an "if" nested in it should have the
                # "loop" tag last so the grammar rule matches that properly.
                for jump_offset in sorted(jump_targets[inst.offset],
                                          reverse=True):
                    come_from_name = 'COME_FROM'
                    opname = self.opname_for_offset(jump_offset)
                    if opname.startswith('SETUP_'):
                        come_from_type = opname[len('SETUP_'):]
                        come_from_name = 'COME_FROM_%s' % come_from_type
                        pass
                    elif inst.offset in self.except_targets:
                        come_from_name = 'COME_FROM_EXCEPT_CLAUSE'
                    tokens.append(
                        Token(come_from_name,
                              None,
                              repr(jump_offset),
                              offset='%s_%s' % (inst.offset, jump_idx),
                              has_arg=True,
                              opc=self.opc))
                    jump_idx += 1
                    pass
                pass
            elif inst.offset in self.else_start:
                end_offset = self.else_start[inst.offset]
                tokens.append(
                    Token('ELSE',
                          None,
                          repr(end_offset),
                          offset='%s' % (inst.offset),
                          has_arg=True,
                          opc=self.opc))

                pass

            pattr = inst.argrepr
            opname = inst.opname

            if op in self.opc.CONST_OPS:
                const = argval
                if iscode(const):
                    if const.co_name == '<lambda>':
                        assert opname == 'LOAD_CONST'
                        opname = 'LOAD_LAMBDA'
                    elif const.co_name == '<genexpr>':
                        opname = 'LOAD_GENEXPR'
                    elif const.co_name == '<dictcomp>':
                        opname = 'LOAD_DICTCOMP'
                    elif const.co_name == '<setcomp>':
                        opname = 'LOAD_SETCOMP'
                    elif const.co_name == '<listcomp>':
                        opname = 'LOAD_LISTCOMP'
                    # verify() uses 'pattr' for comparison, since 'attr'
                    # now holds Code(const) and thus can not be used
                    # for comparison (todo: think about changing this)
                    # pattr = 'code_object @ 0x%x %s->%s' %\
                    # (id(const), const.co_filename, const.co_name)
                    pattr = '<code_object ' + const.co_name + '>'
                else:
                    pattr = const
                    pass
            elif opname in ('MAKE_FUNCTION', 'MAKE_CLOSURE'):
                if self.version >= 3.6:
                    # 3.6+ doesn't have MAKE_CLOSURE, so opname == 'MAKE_FUNCTION'
                    flags = argval
                    opname = 'MAKE_FUNCTION_%d' % (flags)
                    attr = []
                    for flag in self.MAKE_FUNCTION_FLAGS:
                        bit = flags & 1
                        if bit:
                            if pattr:
                                pattr += ", " + flag
                            else:
                                pattr += flag
                        attr.append(bit)
                        flags >>= 1
                    attr = attr[:4]  # remove last value: attr[5] == False
                else:
                    pos_args, name_pair_args, annotate_args = parse_fn_counts(
                        inst.argval)
                    pattr = ("%d positional, %d keyword pair, %d annotated" %
                             (pos_args, name_pair_args, annotate_args))
                    if name_pair_args > 0:
                        opname = '%s_N%d' % (opname, name_pair_args)
                        pass
                    if annotate_args > 0:
                        opname = '%s_A_%d' % (opname, annotate_args)
                        pass
                    opname = '%s_%d' % (opname, pos_args)
                    attr = (pos_args, name_pair_args, annotate_args)
                tokens.append(
                    Token(opname=opname,
                          attr=attr,
                          pattr=pattr,
                          offset=inst.offset,
                          linestart=inst.starts_line,
                          op=op,
                          has_arg=inst.has_arg,
                          opc=self.opc))
                continue
            elif op in self.varargs_ops:
                pos_args = argval
                if self.is_pypy and not pos_args and opname == 'BUILD_MAP':
                    opname = 'BUILD_MAP_n'
                else:
                    opname = '%s_%d' % (opname, pos_args)
            elif self.is_pypy and opname in ('CALL_METHOD',
                                             'JUMP_IF_NOT_DEBUG'):
                # The value in the dict is in special cases in semantic actions, such
                # as CALL_FUNCTION. The value is not used in these cases, so we put
                # in arbitrary value 0.
                customize[opname] = 0
            elif opname == 'UNPACK_EX':
                # FIXME: try with scanner and parser by
                # changing argval
                before_args = argval & 0xFF
                after_args = (argval >> 8) & 0xff
                pattr = "%d before vararg, %d after" % (before_args,
                                                        after_args)
                argval = (before_args, after_args)
                opname = '%s_%d+%d' % (opname, before_args, after_args)

            elif op == self.opc.JUMP_ABSOLUTE:
                # Further classify JUMP_ABSOLUTE into backward jumps
                # which are used in loops, and "CONTINUE" jumps which
                # may appear in a "continue" statement.  The loop-type
                # and continue-type jumps will help us classify loop
                # boundaries The continue-type jumps help us get
                # "continue" statements with would otherwise be turned
                # into a "pass" statement because JUMPs are sometimes
                # ignored in rules as just boundary overhead. In
                # comprehensions we might sometimes classify JUMP_BACK
                # as CONTINUE, but that's okay since we add a grammar
                # rule for that.
                pattr = argval
                # FIXME: 0 isn't always correct
                target = self.get_target(inst.offset)
                if target <= inst.offset:
                    next_opname = self.opname[self.code[inst.offset + 3]]
                    if (inst.offset in self.stmts and
                        (self.version != 3.0 or (hasattr(inst, 'linestart')))
                            and (next_opname not in (
                                'END_FINALLY',
                                'POP_BLOCK',
                                # Python 3.0 only uses POP_TOP
                                'POP_TOP'))):
                        opname = 'CONTINUE'
                    else:
                        opname = 'JUMP_BACK'
                        # FIXME: this is a hack to catch stuff like:
                        #   if x: continue
                        # the "continue" is not on a new line.
                        # There are other situations where we don't catch
                        # CONTINUE as well.
                        if tokens[-1].kind == 'JUMP_BACK' and tokens[
                                -1].attr <= argval:
                            if tokens[-2].kind == 'BREAK_LOOP':
                                del tokens[-1]
                            else:
                                # intern is used because we are changing the *previous* token
                                tokens[-1].kind = intern('CONTINUE')
                    if last_op_was_break and opname == 'CONTINUE':
                        last_op_was_break = False
                        continue

            # FIXME: go over for Python 3.6+. This is sometimes wrong
            elif op == self.opc.RETURN_VALUE:
                if inst.offset in self.return_end_ifs:
                    opname = 'RETURN_END_IF'

            elif inst.offset in self.load_asserts:
                opname = 'LOAD_ASSERT'

            last_op_was_break = opname == 'BREAK_LOOP'
            tokens.append(
                Token(opname=opname,
                      attr=argval,
                      pattr=pattr,
                      offset=inst.offset,
                      linestart=inst.starts_line,
                      op=op,
                      has_arg=inst.has_arg,
                      opc=self.opc))
            pass

        if show_asm in ('both', 'after'):
            for t in tokens:
                print(t)
            print()
        return tokens, customize