def decrypt_file(self, data) -> Union[bytes, None]:
    CRYPT_BLOCK_SIZE = 16
    initialization_vector = data[:CRYPT_BLOCK_SIZE]
    if not self.encryption_key:
        # Try each candidate key until one produces zlib-decompressible output.
        while self.potential_keys:
            encryption_key = self.potential_keys.pop(0)
            try:
                cipher: AES.AESCipher = AES.new(encryption_key.encode(), AES.MODE_CFB, initialization_vector)
                # AES-CFB will silently 'succeed' with a wrong password...
                decrypted_data = cipher.decrypt(data[CRYPT_BLOCK_SIZE:])
                # ...so decompression doubles as password verification.
                _ = zlib.decompress(decrypted_data)
            except zlib.error:
                logger.debug(f"[!] Decryption of .pyc failed with password {encryption_key}. Discarding key.")
            else:
                self.encryption_key = encryption_key
                logger.debug(f"[!] Verified ZlibArchive password is {self.encryption_key}.")
                return decrypted_data
    else:
        try:
            cipher: AES.AESCipher = AES.new(self.encryption_key.encode(), AES.MODE_CFB, initialization_vector)
            return cipher.decrypt(data[CRYPT_BLOCK_SIZE:])
        except zlib.error as e:
            logger.error(f"[!] Failed to decrypt .pyc with error: {e}")
    return None
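# Usage sketch (hypothetical names): `archive` is a ZlibArchive-like instance
# whose potential_keys list was populated by check_for_password_file, and
# `blob` holds one encrypted entry (IV prepended) from the PYZ archive:
#
#   plaintext = archive.decrypt_file(blob)
#   if plaintext is None:
#       logger.error("[!] No candidate key could decrypt this entry.")
#
# Note that AES-CFB decryption with a wrong key does not raise an exception,
# it simply yields garbage; that is why decrypt_file verifies each candidate
# key by attempting zlib.decompress() on the result and catching zlib.error.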
def check_for_password_file(self):
    self.potential_keys = []
    if hasattr(self, "archive_path"):
        dir_of_pyz = self.archive_path.parent
    else:
        dir_of_pyz = Path.cwd()
    key_file = dir_of_pyz / "pyimod00_crypto_key.pyc"
    if key_file.exists():
        self.encrypted = True
        logger.debug(f"[+] Found ZlibArchive encryption key file at path {key_file}")
        crypto_key_filename: str  # full path of the pyc file at compile time
        try:
            (
                crypto_key_filename,
                crypto_key_co,
                crypto_key_python_version,
                crypto_key_compilation_timestamp,
                crypto_key_magic_int,
                crypto_key_is_pypy,
                crypto_key_source_size,
                crypto_key_sip_hash,
            ) = disassemble_file(str(key_file), outstream=open(os.devnull, "w"))
        except Exception as e:
            logger.warning(f"[!] Could not disassemble file {key_file}. Received error: {e}")
        else:
            self.compilation_time = datetime.fromtimestamp(crypto_key_compilation_timestamp)
            for const_string in crypto_key_co.co_consts:
                if const_string and len(const_string) == 16:
                    self.potential_keys.append(const_string)
        # If we couldn't disassemble the file to see the consts, let's just
        # search the raw bytes of the file for the password.
        if not self.potential_keys:
            with key_file.open("rb") as file_ptr:
                file_strings = utils.parse_for_strings(file_ptr.read())
            s: str
            for s in file_strings:
                if len(s) >= 16 and "pyimod00_crypto_key" not in s:
                    # Slide a 16-byte window across the string; every 16-byte
                    # substring is a potential AES key.
                    while len(s) >= 16:
                        self.potential_keys.append(s[0:16])
                        s = s[1:]
        logger.info(f"[*] Found these potential PyInstaller PYZ Archive encryption keys: {self.potential_keys}")
        if not self.potential_keys:
            logger.error("[*] Encryption key file detected, however no password was able to be retrieved.")
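# Example of the sliding-window key search above (hypothetical 17-character
# string): for s = "abcdefghijklmnopq", the loop collects "abcdefghijklmnop"
# (s[0:16]) and then "bcdefghijklmnopq" (the next 16-byte window), so every
# 16-byte substring of each candidate string gets tried as an AES key.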
def parse_toc(self) -> None:
    self.magic_int = magic2int(self.archive_contents[4:8])
    (toc_position,) = struct.unpack("!i", self.archive_contents[8:12])
    self.toc = xdis.unmarshal.load_code(
        self.archive_contents[toc_position:], self.magic_int
    )  # TODO: wrap this in a try block?
    logger.debug(f"[*] Found {len(self.toc)} entries in this PYZ archive")
    # From PyInstaller 3.1+, the TOC is a list of tuples; convert it to a dict
    # for uniform access.
    if isinstance(self.toc, list):
        self.toc = dict(self.toc)
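# The unmarshalled TOC maps module names to (type_code, position,
# compressed_size) triples; a hypothetical entry might look like:
#
#   {"mymodule": (1, 5120, 987), ...}
#
# which is the shape extract_files() consumes below when carving each entry
# out of the archive.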
def unpack(self) -> None:
    """Dump any interesting aspects of this PE for further investigation.

    This will log the PE's version info resource for manual inspection, dump
    any Authenticode certificates, and look for frozen Python artifacts within
    the PE's resources and overlay.
    """
    self.load_version_info()
    self.dump_certificates()
    unpack_me: List[pathlib.Path] = []
    overlay_path: pathlib.Path = self.dump_overlay()
    if overlay_path:
        unpack_me.append(overlay_path)
    version_strings: List[str] = utils.parse_for_version_strings(self.file_contents)
    if version_strings:
        logger.debug(
            "[*] Found the following strings (and their surrounding bytes, for context) in this PE, which may "
            "indicate the version of Python used to freeze the executable: \n"
            f"{pprint.pformat(version_strings, width=120)}"
        )
    pythonscript_idx: int = None
    if hasattr(self.pe, "DIRECTORY_ENTRY_RESOURCE"):
        entry: pefile.ResourceDirEntryData
        for entry in self.pe.DIRECTORY_ENTRY_RESOURCE.entries:
            if entry.name is None:
                continue
            resource_name: str = entry.name.string.decode()
            if any(re.match(pattern, resource_name, re.I) for pattern in self.INTERESTING_RESOURCES):
                resource_path: pathlib.Path = self.dump_resource(resource_name)
                if resource_name == "PYTHONSCRIPT":
                    pythonscript_idx = len(unpack_me)
                unpack_me.append(resource_path)
    if pythonscript_idx is not None:
        # Unpack the Py2Exe PYTHONSCRIPT resource last to give it the highest
        # chance of successfully determining the Python version.
        unpack_me.append(unpack_me.pop(pythonscript_idx))
    artifact_path: pathlib.Path
    for artifact_path in unpack_me:
        output_dir_name: str = utils.slugify(str(artifact_path.name) + "_output")
        pydecipher.unpack(
            artifact_path,
            output_dir=self.output_dir.joinpath(output_dir_name),
            **self.kwargs,
        )
def opcode_constants_remap(
    opcode_file: pathlib.Path,
    provided_version: str = None,
) -> Tuple[Dict[int, Dict[int, int]], str]:
    """Parse code object constants to try and recreate opcode mappings.

    This method walks the constants attribute of the opcode.pyc code object.
    See the remap documentation for more information on this method.

    Parameters
    ----------
    opcode_file: pathlib.Path
        The path on disk to the opcode.pyc file.
    provided_version: str, optional
        The version of Python that this opcode file corresponds to.

    Returns
    -------
    Tuple[Dict[int, Dict[int, int]], str]
        A tuple containing a dictionary of original_opcode to
        Dict[replacement_opcode: replacement_count] and the opmap's Python
        version. replacement_opcode is an opcode that was seen in place of
        original_opcode, and replacement_count is the number of times it was
        seen replacing original_opcode throughout all the bytecode that was
        analyzed.
    """

    def get_nearest_opcode(opname: str, unused_opcodes: List[int], version: str) -> int:
        xdis_opcode: ModuleType
        try:
            xdis_opcode = xdis.disasm.get_opcode(version, is_pypy=False)
            actual_opcode = getattr(xdis_opcode, opname)
        except Exception:
            return unused_opcodes[0]
        smallest_distance: int = 999
        closest_opcode: int = -1
        for opcode in unused_opcodes:
            if abs(actual_opcode - opcode) < smallest_distance:
                closest_opcode = opcode
                smallest_distance = abs(actual_opcode - opcode)
        return closest_opcode

    logger.debug(f"[*] Checking opcode.pyc file at {str(opcode_file)} to determine if opcode map is normal.")
    fixed_pyc_file: tempfile.NamedTemporaryFile
    if fixed_pyc_file := artifact_types.pyc.Pyc.check_and_fix_pyc(opcode_file, provided_version=provided_version):
        logger.debug(
            f"[+] Duplicated opcode file {str(opcode_file)} to correct issues with the pyc. New filepath:"
            f" {fixed_pyc_file.name}"
        )
        opcode_file = fixed_pyc_file.name
def check_for_our_xdis() -> None:
    """Check that the pydecipher fork of xdis is installed.

    Exits if it's not.
    """
    if hasattr(xdis.op_imports, "remap_opcodes"):
        logger.debug("[*] Custom version of xdis detected. All clear to proceed.")
    else:
        logger.error(
            "[!] It seems that the public/normal version of xdis has been installed. Please see the documentation"
            " on how to download the pydecipher-customized fork of xdis."
        )
        sys.exit(1)
def validate_pyinstaller_carchive(self):
    self.magic_index = self.archive_contents.find(self.MAGIC)
    if self.magic_index < 0:
        logger.debug("[!] Could not find PyInstaller magic within this archive.")
        return False
    cookie_size = len(self.archive_contents) - self.magic_index
    if cookie_size == self.PYINST20_COOKIE_SIZE:
        self.pyinstaller_version = 2.0
        logger.debug("[*] PyInstaller version: 2.0")
        return True
    elif cookie_size == self.PYINST21_COOKIE_SIZE:
        self.pyinstaller_version = 2.1  # or greater
        logger.debug("[*] PyInstaller version: 2.1")
        return True
    logger.debug(
        f"[!] PyInstaller cookie size is {cookie_size}, which does not correspond to a known "
        "version of PyInstaller."
    )
    if cookie_size < 100:  # Some valid cookies have been seen with size 94
        self.pyinstaller_version = "unknown"
        return True
    return False
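# For context, the CArchive cookie validated above is a fixed-size struct at
# the end of the archive, and its size is what distinguishes versions (see
# the struct formats used in parse_toc below):
#
#   PyInstaller 2.0:  "!8siiii"    -> magic, package length, TOC offset,
#                                     TOC size, Python version
#   PyInstaller 2.1+: "!8siiii64s" -> the same fields plus the Python DLL name
#
# struct.calcsize("!8siiii") == 24 and struct.calcsize("!8siiii64s") == 88,
# which is what PYINST20_COOKIE_SIZE and PYINST21_COOKIE_SIZE are expected
# to hold.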
def unpack(python_artifact: os.PathLike, output_dir: str = None, **kwargs) -> None:
    """Recursively extract interesting resources from the Python artifact.

    This function cycles through all the registered ARTIFACT_TYPES. See usages
    of :py:meth:`pydecipher.__init__.register` for the creation of this list.
    ARTIFACT_TYPES consists of the different 'unpackable', registered (via
    decorator) Python artifact classes, in a dictionary of the format
    <Artifact_Name: Class Instance of Artifact_Name>. A class's constructor
    should raise a TypeError if it is being instantiated with something that
    isn't the correct type (i.e. a Py2Exe resource being passed to a
    PyInstaller archive constructor).

    Parameters
    ----------
    python_artifact : pathlib.Path or io.IOBase (file-like object)
        The path to the Python artifact.
    output_dir : str, optional
        Where to dump the extracted output of artifact parsers. If no
        directory is specified, a directory will be created in the current
        working directory.
    **kwargs
        Arbitrary keyword arguments. Including, but not limited to:

        version_hint: str
            The (potential) Python version of the artifact. If you know the
            version, you should pass it in. Otherwise, pydecipher will try to
            automatically figure out what version was used through
            string-analysis (and possibly brute-force decompilation).
    """
    if output_dir:
        output_dir: pathlib.Path = pathlib.Path(output_dir).resolve()
    type_instance = None
    logger.info(f"[*] Unpacking {python_artifact}")
    for type_, class_ in pydecipher.ARTIFACT_TYPES.items():
        logger.debug(f"[*] Checking {type_} magic for file {python_artifact.name}")
        try:
            type_instance = class_(python_artifact, output_dir=output_dir, **kwargs)
        except TypeError:
            logger.debug(f"[*] Magic incorrect for type {type_}")
        else:
            logger.debug(f"[*] Determined {python_artifact.name} type to be {type_}")
            break
    else:  # This should never be reached
        logger.debug("[!] No artifact types found! Something went wrong. Please submit a bug report.")
    if type_instance:
        type_instance.unpack()
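# Usage sketch (hypothetical paths): unpack() recurses into nested artifacts
# on its own, so a single call on the frozen executable is usually enough.
#
#   import pathlib
#   import pydecipher
#
#   pydecipher.unpack(
#       pathlib.Path("frozen_app.exe"),
#       output_dir="frozen_app_output",
#       version_hint="3.7",  # optional, forwarded to parsers via **kwargs
#   )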
def disassemble_and_dump(self, brute_force: bool = False):
    code_bytes = self.resource_contents[self.marshalled_obj_start_idx:]
    hijacked_stderr = io.StringIO()
    with redirect_stderr(hijacked_stderr):
        try:
            # TODO: make this error catching more specific
            code_objects = load_code(code_bytes, self.magic_num)
            if not isinstance(code_objects, list):
                # TODO: make this a non-generic error
                raise RuntimeError("Py2Exe should return a marshalled list of code objects")
            if not all(code_objects):
                raise RuntimeError("NoneType code objects returned")
        except Exception:
            logger.debug(
                f"[!] Failed to produce disassembly of bytecode with magic num {self.magic_num} "
                f"(Python version {magicint2version[self.magic_num]})"
            )
            self.magic_num = -1
            return
        else:
            logger.info(
                f"[+] Successfully disassembled bytecode with magic number {self.magic_num}, "
                f"corresponding to Python version {magicint2version[self.magic_num]}"
            )
    for co in code_objects:
        new_filename: str = self._clean_filename(co.co_filename)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        if brute_force:
            bytecode_filepath: pathlib.Path = self.output_dir / magicint2version[self.magic_num] / new_filename
            bytecode_filepath.parent.mkdir(exist_ok=True)
        else:
            bytecode_filepath: pathlib.Path = self.output_dir.joinpath(new_filename)
        try:
            xdis.load.write_bytecode_file(bytecode_filepath, co, self.magic_num)
        except Exception as e:
            logger.error(f"[!] Could not write file {bytecode_filepath.name} with error: {e}")
        else:
            logger.info(f"[+] Successfully wrote file {new_filename} to {self.output_dir}")
def load_version_info(self, quiet: bool = False) -> None:
    """Extract the VersionInfo dictionary from the pefile.PE object.

    If pydecipher is running in anything but 'quiet' mode, it will print the
    version info to the log. Additionally, it will search for Python version
    strings within the version info.

    Parameters
    ----------
    quiet : bool, optional
        Whether or not to print the version info dictionary to the log.
    """
    if not hasattr(self.pe, "FileInfo"):
        return
    structure: pefile.Structure
    for structure in self.pe.FileInfo:
        sub_structure: pefile.Structure
        for sub_structure in structure:
            if sub_structure.Key != b"StringFileInfo":
                continue
            if hasattr(sub_structure, "StringTable"):
                string_table: pefile.Structure
                for string_table in sub_structure.StringTable:
                    if string_table.entries:
                        self.version_info = {
                            x.decode("utf-8"): y.decode("utf-8")
                            for x, y in string_table.entries.items()
                        }
    formatted_version_info: str = json.dumps(self.version_info, indent=4, separators=(",", ": "))
    if not quiet:
        logger.debug(f"[*] This PE had the following VersionInfo resource: {formatted_version_info}")
    if "python" in str(self.version_info).lower():
        if "FileVersion" in self.version_info:
            self.python_version = self.version_info["FileVersion"]
        if "ProductVersion" in self.version_info:
            if self.python_version and len(self.python_version) < len(self.version_info["ProductVersion"]):
                # Assume the longer string carries more detailed version info
                # (we'd rather know it was 2.7.14 than just 2.7).
                self.python_version = self.version_info["ProductVersion"]
def extract_files(self) -> None:
    decompression_errors = 0
    successfully_extracted = 0
    for key in self.toc.keys():
        (type_code, position, compressed_data_size) = self.toc[key]
        timestamp = getattr(self, "compilation_time", None)
        header_bytes = pydecipher.bytecode.create_pyc_header(self.magic_int, compilation_ts=timestamp, file_size=0)
        compressed_data = self.archive_contents[position:position + compressed_data_size]
        if self.encrypted:
            compressed_data = self.decrypt_file(compressed_data)
            if compressed_data is None:  # decrypt_file returns None on failure
                decompression_errors += 1
                continue
        try:
            uncompressed_data = zlib.decompress(compressed_data)
        except zlib.error as e:
            decompression_errors += 1
            logger.debug(f"[!] PYZ zlib decompression failed with error: {e}")
        else:
            pyc_file = self.output_dir / str(key + ".pyc")
            self.output_dir.mkdir(parents=True, exist_ok=True)
            with pyc_file.open("wb") as pyc_file_ptr:
                pyc_file_ptr.write(header_bytes + uncompressed_data)
            successfully_extracted += 1
    if decompression_errors:
        logger.debug(f"[!] Failed to write {decompression_errors} files due to decompression errors.")
    if successfully_extracted:
        logger.info(f"[+] Successfully extracted {successfully_extracted} files from this ZlibArchive.")
def check_and_fix_pyc(
    pyc_file: pathlib.Path, provided_version: str = None
) -> Union[None, tempfile.NamedTemporaryFile]:
    """Fix a given pyc file so it can be properly disassembled by xdis.

    This function combats the following common obfuscations that may be
    applied to pyc files and would prevent them from easily being
    disassembled:

    1. Missing the header entirely
    2. Missing only the magic bytes
    3. Magic bytes are there, but they don't match a known version
    4. Filename doesn't end in .pyc

    Parameters
    ----------
    pyc_file: pathlib.Path
        The path to the pyc file.
    provided_version: str, optional
        The version of Python that compiled the pyc, if known.

    Raises
    ------
    RuntimeError
        The pyc file is malformed and couldn't be corrected, likely due to a
        version not being given.

    Returns
    -------
    Union[None, tempfile.NamedTemporaryFile]
        If the pyc file is fine as is, this function returns None. If it needs
        to be fixed in some way, the temporary file object with the fixes is
        returned.
    """
    corrected_file_contents: bytes = b""
    all_bytes: bytes = b""
    utils.check_read_access(pyc_file)
    infile: BinaryIO
    with pyc_file.open("rb") as infile:
        first_24_bytes: bytes = infile.read(min(24, pyc_file.stat().st_size))
        infile.seek(0)
        all_bytes = infile.read()
    if not any(p in first_24_bytes for p in Pyc.MARSHALLED_CODE_OBJECT_LEADING_BYTES):
        raise RuntimeError(f"This file {str(pyc_file)} isn't a pyc file!")
    if provided_version:
        correct_magic_num = bytecode.version_str_to_magic_num_int(provided_version)
        header = bytecode.create_pyc_header(correct_magic_num)
    if Pyc.is_headerless(first_24_bytes[:8]):
        # Is this pyc completely missing a header?
        if provided_version:
            corrected_file_contents = header + all_bytes
        else:
            logger.error(
                "[!] The pyc file provided does not have a header. For remap to decompile this, please provide a"
                " version with the --version flag"
            )
            raise RuntimeError
    elif first_24_bytes[0:4] not in by_magic:
        # The file has a header of sorts, but we can't recognize the magic
        # number. We'll need a version from the user to proceed.
        if not provided_version:
            logger.error(
                "[!] This file has a header, but we can't recognize the magic number"
                f" {struct.unpack('<H', first_24_bytes[0:2])[0]}. No version was provided to fix the header."
            )
            raise RuntimeError
        else:
            logger.debug(
                "[*] This file has a header, but we can't recognize the magic number"
                f" {struct.unpack('<H', first_24_bytes[0:2])[0]}. Using magic num {correct_magic_num} (from"
                f" provided version {provided_version}) to fix the header."
            )
            code_object_begin_index: int = -1
            pattern: bytes
            for pattern in Pyc.MARSHALLED_CODE_OBJECT_LEADING_BYTES:
                if pattern in all_bytes:
                    code_object_begin_index = all_bytes.index(pattern)
                    break
            corrected_file_contents = header + all_bytes[code_object_begin_index:]
    bytes_to_write_out: bytes = b""
    if corrected_file_contents:
        bytes_to_write_out = corrected_file_contents
    elif pyc_file.suffix != ".pyc":
        # There was nothing to correct except the filename, so we just duplicate the file.
        bytes_to_write_out = all_bytes
    else:
        # There was nothing to do with this pyc file. It is seemingly valid.
        return None
    temp_file: tempfile.NamedTemporaryFile = tempfile.NamedTemporaryFile(suffix=".pyc")
    pyc_fixed_file: pathlib.Path = pathlib.Path(temp_file.name)
    outfile: BinaryIO
    with pyc_fixed_file.open("wb") as outfile:
        outfile.write(bytes_to_write_out)
    return temp_file
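# Usage sketch (hypothetical path). Keep a reference to the returned object
# while it is in use: NamedTemporaryFile deletes its backing file once the
# object is closed or garbage-collected.
#
#   fixed = Pyc.check_and_fix_pyc(pathlib.Path("dumped.bin"), provided_version="2.7")
#   pyc_path = pathlib.Path(fixed.name) if fixed else pathlib.Path("dumped.bin")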
def _determine_python_version(self):
    """Attempt to determine the version of Python used when this Py2Exe PE was compiled.

    We need to know this because xdis requires knowledge of the Python version
    to unmarshal the bytecode correctly.
    """
    potential_magic_nums = set()
    logger.debug("[*] Attempting to discover version for PYTHONSCRIPT resource")

    # Method 1: Look for a PythonXY.DLL in the same directory as the
    # PYTHONSCRIPT resource. If there, check whether it has a VERSIONINFO
    # resource with a FileVersion or ProductVersion field, as these typically
    # contain the Python version. See https://github.com/erocarrera/pefile for
    # more info on the structures used below.
    if hasattr(self, "archive_path"):
        parent_dir = self.archive_path.parents[0]
    else:
        parent_dir = pathlib.Path.cwd()
    for python_dll in os.listdir(parent_dir):
        if re.match(r"python[0-9]{0,2}\.dll", python_dll, re.I):
            logger.debug(f"[*] Found python DLL resource {str(python_dll)} in directory {parent_dir}")
            try:
                dll_class_inst = PortableExecutable(parent_dir.joinpath(python_dll))
            except TypeError:
                logger.debug(f"[!] PyDecipher could not create a PE/DLL class instance for {str(python_dll)}")
            else:
                dll_class_inst.load_version_info(quiet=True)
                if dll_class_inst.python_version:
                    potential_magic_nums.add(version_str_to_magic_num_int(dll_class_inst.python_version))
            finally:
                # Only the first matching DLL is considered.
                break

    # Method 2: Check whether there are pyc files in the same directory whose
    # headers still contain magic numbers.
    for pyc_file in parent_dir.rglob("*.pyc"):
        with pyc_file.open("rb") as pyc_file_ptr:
            try:
                magic_bytes = pyc_file_ptr.read(4)
                magic_num = magic2int(magic_bytes)
            except Exception:  # TODO: make this error catching more specific
                pass
            else:
                potential_magic_nums.add(magic_num)
                break

    # Method 3: Search the PYTHONSCRIPT resource for strings like
    # c:\python24\lib\site-packages\py2exe\boot_common.py
    b_python_regex = re.compile(b"(python)([0-9]{2})", re.I)
    script_re_obj = b_python_regex.search(self.resource_contents)
    if script_re_obj:
        version_str = script_re_obj.group(2).decode("utf-8")
        logger.info(
            "[*] Detected potential version string in PYTHONSCRIPT resource: "
            f"{script_re_obj.group().decode('utf-8')}"
        )
        potential_magic_nums.add(version_str_to_magic_num_int(version_str[0] + "." + version_str[1]))

    if potential_magic_nums:
        logger.info(f"[*] Will attempt to unmarshal using these python magic numbers: {potential_magic_nums}")
        return potential_magic_nums
    logger.info(
        "[!] Couldn't find any python magic numbers to hint at the python version of this resource. "
        "Will attempt to brute-force determine the correct magic number."
    )
    return None
def standard_pyc_remap(
    standard_bytecode_path: pathlib.Path,
    remapped_bytecode_path: pathlib.Path,
    version: str = None,
) -> Tuple[Dict[int, Dict[int, int]], str]:
    """Diff compiled code objects from the standard library and a modified interpreter to try and recreate opcode mappings.

    This method is similar to the megafile method, but at a larger scale. See
    the remap documentation for more information on this method.

    Parameters
    ----------
    standard_bytecode_path: pathlib.Path
        The path on disk to the reference set of standard-compiled bytecode.
        The version of Python for the reference set must correspond to the
        version of Python used as a base for the modified interpreter.
    remapped_bytecode_path: pathlib.Path
        The path on disk to the set of bytecode compiled by the modified
        interpreter.
    version: str, optional
        The version of Python that this bytecode corresponds to.

    Returns
    -------
    Tuple[Dict[int, Dict[int, int]], str]
        A tuple containing a dictionary of original_opcode to
        Dict[replacement_opcode: replacement_count] and the opmap's Python
        version. replacement_opcode is an opcode that was seen in place of
        original_opcode, and replacement_count is the number of times it was
        seen replacing original_opcode throughout all the bytecode that was
        analyzed.
    """
    reference_files: Dict[str, List[pathlib.Path]] = {}
    determined_version: str = ""
    pyc_file: pathlib.Path
    for pyc_file in standard_bytecode_path.rglob("*.pyc"):
        pyc_file_name: str = pyc_file.name.split(".")[0]
        if pyc_file_name == "__init__":
            continue
        if not determined_version:
            try:
                infile: BinaryIO
                with pyc_file.open("rb") as infile:
                    pyc_magic_bytes: bytes = infile.read(4)
                version_set: Set[str] = copy.deepcopy(xdis.magics.by_magic[pyc_magic_bytes])
                determined_version = version_set.pop()
            except Exception:
                pass
            else:
                logger.debug(f"Determined version {determined_version} from reference bytecode.")
                if version and bytecode.version_str_to_magic_num_int(
                    determined_version
                ) != bytecode.version_str_to_magic_num_int(version):
                    logger.warning(
                        f"Provided version {version} does not equal the version determined in the reference pyc "
                        f"set ({determined_version}). We will proceed with the version you provided."
                    )
        if pyc_file_name in reference_files:
            reference_files[pyc_file_name].append(pyc_file)
        else:
            reference_files[pyc_file_name] = [pyc_file]
    if not version:
        version = determined_version

    remapped_files: Dict[str, List[pathlib.Path]] = {}
    for pyc_file in remapped_bytecode_path.rglob("*"):
        if not pyc_file.is_file():
            continue
        try:
            kwargs: Dict[str, str] = {"version_hint": version}
            artifact_types.pyc.Pyc(pyc_file, **kwargs)
        except TypeError:
            continue
        pyc_file_name: str = pyc_file.name.split(".")[0]
        if pyc_file_name == "__init__":
            # Too common a filename; it causes more problems than it's worth
            # to try to include these, since they are usually empty anyway.
            continue
        if pyc_file_name in remapped_files:
            remapped_files[pyc_file_name].append(pyc_file)
        else:
            remapped_files[pyc_file_name] = [pyc_file]

    master_remapping_counts: Dict[int, Dict[int, int]] = {}
    pyc_filename: str
    list_of_filepaths: List[pathlib.Path]
    for pyc_filename, list_of_filepaths in remapped_files.items():
        if pyc_filename not in reference_files:
            continue
        pyc_filepath: pathlib.Path
        for pyc_filepath in list_of_filepaths:
            # Pair this remapped pyc with the reference pyc whose relative
            # path is most similar.
            reference_file: pathlib.Path = None
            highest_similarity: float = 0.0
            ref_pyc_filepath: pathlib.Path
            for ref_pyc_filepath in reference_files[pyc_filename]:
                relative_reference_filepath: str = str(ref_pyc_filepath.relative_to(standard_bytecode_path))
                relative_remapped_filepath: str = str(pyc_filepath.relative_to(remapped_bytecode_path))
                path_similarity: float = textdistance.lcsstr.normalized_similarity(
                    relative_reference_filepath, relative_remapped_filepath
                )
                if path_similarity > highest_similarity:
                    highest_similarity = path_similarity
                    reference_file = ref_pyc_filepath
            if not reference_file:
                continue
            fixed_pyc_file: tempfile.NamedTemporaryFile
            if fixed_pyc_file := artifact_types.pyc.Pyc.check_and_fix_pyc(pyc_filepath, provided_version=version):
                logger.debug(
                    f"[+] Duplicated file {str(pyc_filepath)} to correct issues with the pyc. New filepath:"
                    f" {fixed_pyc_file.name}"
                )
                pyc_filepath = fixed_pyc_file.name
            try:
                remapped_filename: str
                remapped_co: CodeType  # can also be an xdis code type
                remapped_version: float
                remapped_timestamp: int
                remapped_magic_int: int
                remapped_is_pypy: bool
                remapped_source_size: int
                remapped_sip_hash: str
                (
                    remapped_filename,
                    remapped_co,
                    remapped_version,
                    remapped_timestamp,
                    remapped_magic_int,
                    remapped_is_pypy,
                    remapped_source_size,
                    remapped_sip_hash,
                ) = xdis.disasm.disassemble_file(str(pyc_filepath), header=True, outstream=open(os.devnull, "w"))
                reference_filename: str
                reference_co: CodeType  # can also be an xdis code type
                reference_version: float
                reference_timestamp: int
                reference_magic_int: int
                reference_is_pypy: bool
                reference_source_size: int
                reference_sip_hash: str
                (
                    reference_filename,
                    reference_co,
                    reference_version,
                    reference_timestamp,
                    reference_magic_int,
                    reference_is_pypy,
                    reference_source_size,
                    reference_sip_hash,
                ) = xdis.disasm.disassemble_file(str(reference_file), outstream=open(os.devnull, "w"))
            except Exception:
                continue
            version = str(reference_version)
            try:
                remappings: Dict[int, Dict[int, int]] = bytecode.diff_opcode(reference_co, remapped_co, version)
            except RuntimeError:
                continue
            # Merge these remappings into the larger dictionary.
            opcode_val: int
            remap_options: Dict[int, int]
            for opcode_val, remap_options in remappings.items():
                if opcode_val in master_remapping_counts:
                    remap_option: int
                    count: int
                    for remap_option, count in remap_options.items():
                        if remap_option in master_remapping_counts[opcode_val]:
                            master_remapping_counts[opcode_val][remap_option] += count
                        else:
                            master_remapping_counts[opcode_val][remap_option] = count
                else:
                    master_remapping_counts[opcode_val] = remap_options
    return master_remapping_counts, version
def extract_files(self):
    magic_nums: set = set()
    decompression_errors = 0
    successfully_extracted = 0
    entry: CTOCEntry
    for entry in self.toc:
        data = self.archive_contents[entry.entry_offset:entry.entry_offset + entry.compressed_data_size]
        if entry.compression_flag:
            try:
                data = zlib.decompress(data)
            except zlib.error as e:
                decompression_errors += 1
                logger.debug(f"[!] PyInstaller CArchive decompression failed with error: {e}")
                continue
            else:
                if len(data) != entry.uncompressed_data_size:
                    logger.warning(
                        f"[!] {entry.name} entry in CArchive listed its uncompressed data size as"
                        f" {entry.uncompressed_data_size}, however it actually uncompressed to {len(data)}"
                        " bytes. This may be a sign that the CArchive was manually altered."
                    )
        if "\\" in entry.name:
            tmp: PureWindowsPath = pathlib.PureWindowsPath(entry.name)
        else:
            tmp: Path = Path(entry.name)
        file_path = pathlib.Path(self.output_dir).joinpath(tmp)
        if len(file_path.parents) > 1:  # every path has '.' as a parent
            file_path.parent.mkdir(parents=True, exist_ok=True)
        if entry.type_code == self.ArchiveItem.PYSOURCE:
            if ord(data[:1]) == ord(xdis.marsh.TYPE_CODE) or ord(data[:1]) == (
                ord(xdis.marsh.TYPE_CODE) | xdis.unmarshal.FLAG_REF
            ):
                # This 'source' entry is actually a headerless marshalled code object.
                file_path = file_path.parent / (file_path.name + ".pyc")
                if len(magic_nums) > 1:
                    magic_num = next(iter(magic_nums))
                    logger.warning(
                        "[!] More than one magic number found within this CArchive. Using magic number"
                        f" {magic_num}, but also found numbers: {magic_nums}"
                    )
                elif len(magic_nums) == 0:
                    logger.warning("[!] No magic numbers have been found yet, queueing this file for later.")
                    # TODO: add this file to a do-later list, once the magic num is known.
                    # TODO: does this actually happen? Dig deeper...
                data = pydecipher.bytecode.create_pyc_header(next(iter(magic_nums))) + data
            else:
                file_path = file_path.parent / (file_path.name + ".py")
                if "pyi" not in entry.name:
                    logger.info(f"[!] Potential entrypoint found at script {entry.name}.py")
        elif entry.type_code == self.ArchiveItem.PYMODULE:
            magic_bytes = data[:4]  # Python magic value
            magic_nums.add(magic2int(magic_bytes))
            file_path = file_path.parent / (file_path.name + ".pyc")
        if entry.type_code != self.ArchiveItem.RUNTIME_OPTION:
            self.output_dir.mkdir(parents=True, exist_ok=True)
            with file_path.open(mode="wb") as f:
                f.write(data)
            successfully_extracted += 1
        if entry.type_code in (self.ArchiveItem.PYZ, self.ArchiveItem.ZIPFILE):
            output_dir_name = str(file_path.parent.joinpath(utils.slugify(file_path.name.split(".")[0]))) + "_output"
            pydecipher.unpack(file_path, output_dir=output_dir_name)
    if decompression_errors:
        logger.debug(f"[!] Failed to write {decompression_errors} files due to decompression errors.")
    if successfully_extracted:
        logger.info(f"[+] Successfully extracted {successfully_extracted} files from this CArchive.")
def fill_opmap_gaps(remappings: Dict[int, int], version: str) -> Dict[int, Tuple[int, bool]]:
    """Fill the opmap with any missing opcodes for a specific version.

    Since pydecipher can only take in a valid opmap, we must make sure remap
    dumps opmaps that contain complete sets of opcodes. Very rarely will an
    opcode remapping method be able to cover 100% of opcodes in use for a
    particular Python version, so we need to fill the gaps with some guesses.

    Parameters
    ----------
    remappings: Dict[int, int]
        A dictionary of original opcode to remapped opcode.
    version: str
        A version string `accepted by xdis`_.

    .. _accepted by xdis: https://github.com/rocky/python-xdis/blob/master/xdis/magics.py

    Returns
    -------
    Dict[int, Tuple[int, bool]]
        A dictionary of original opcode to remapped opcode and a boolean
        indicating whether or not this remapping was guessed or observed.
    """
    filled_remappings: Dict[int, Tuple[int, bool]] = {k: (v, False) for k, v in remappings.items()}
    is_pypy: bool = "pypy" in version
    try:
        opcode_obj: ModuleType = xdis.disasm.get_opcode(version, is_pypy)
    except KeyError:
        raise KeyError(f"[!] The version specified, {version}, is not supported by xdis.")
    xdis_opcode_map: Dict[str, int] = opcode_obj.opmap
    xdis_opcode_vals: Set[int] = set(xdis_opcode_map.values())
    remaining_options: List[int] = list(xdis_opcode_vals.difference(set(remappings.values())))
    logger.debug(f"[*] Set of opcodes available to assign from standard opmap: {remaining_options}")
    missing_opcodes = list(xdis_opcode_vals.difference(set(remappings.keys())))
    logger.debug(f"[*] Set of opcodes that need an assignment in the modified opmap: {missing_opcodes}")
    missing_opcode: int
    for missing_opcode in missing_opcodes:
        # Assign the numerically nearest unused opcode as a best-effort guess.
        smallest_distance: int = 999
        best_option: int = -1
        option: int
        for option in remaining_options:
            distance: int = abs(option - missing_opcode)
            if distance < smallest_distance:
                best_option = option
                smallest_distance = distance
        filled_remappings[missing_opcode] = (best_option, True)
        remaining_options.remove(best_option)
    return filled_remappings
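# Worked example (hypothetical opcodes): say standard Python defines opcode
# values {1, 2, 3} and we observed the remappings {1: 3, 2: 1}. Then
# remaining_options = {1, 2, 3} - {3, 1} = {2} and missing_opcodes =
# {1, 2, 3} - {1, 2} = {3}, so opcode 3 is assigned (2, True), where the True
# flags this entry as a guess rather than an observed remapping.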
def write_remapping_file(
    remappings: Dict[int, Tuple[int, bool]],
    version: str,
    method: str,
    cli: str,
    output_dir: Union[str, pathlib.Path] = ".",
) -> pathlib.Path:
    """Write the remappings dict to a JSON file that can be used by pydecipher.

    It is assumed that by this point `remappings` is a bijection of original
    opcodes and replacement opcodes.

    Parameters
    ----------
    remappings: Dict[int, Tuple[int, bool]]
        A dictionary of original_opcode to (replacement_opcode, guess).
        replacement_opcode is the remapped value of original_opcode, and the
        guess boolean is whether or not remap actually observed this remapping
        or had to 'guess' it in order to produce a complete set of opcodes.
    version: str
        A version string `accepted by xdis`_.
    output_dir: Union[str, os.PathLike]
        The path where the remapping file should be written.
    method: str
        A text description of the remapping method used.
    cli: str
        The command line for the remap command that produced this file.

    .. _accepted by xdis: https://github.com/rocky/python-xdis/blob/master/xdis/magics.py

    Returns
    -------
    pathlib.Path
        The path to the remapping JSON file.
    """
    output_dict: Dict[str, Union[str, List[Dict[str, int]]]] = {
        "python_version": str(version),
        "remapped_opcodes": [],
        "method": method,
        "command_line": json.dumps(cli),
    }
    xdis_opcode: ModuleType = None
    try:
        xdis_opcode = xdis.disasm.get_opcode(version, is_pypy=False)
    except Exception:
        logger.debug(f"[!] Couldn't retrieve version {version} from xdis! Continuing anyway...")
    opcode_val: int
    remap_val: Tuple[int, bool]
    for opcode_val, remap_val in remappings.items():
        output_subdict: Dict[str, int] = {
            "opcode": opcode_val,
            "remapped_value": remap_val[0],
            "guess": bool(remap_val[1]),
        }
        if xdis_opcode:
            opname: str = xdis_opcode.opname[opcode_val]
            output_subdict["opname"] = opname.replace("+", "_")
        output_dict["remapped_opcodes"].append(output_subdict)
    # We sort based on the original opcode value because it seems like the most
    # natural way to sort this, and it is useful to have a standardized
    # output for comparison purposes.
    output_dict["remapped_opcodes"] = sorted(output_dict["remapped_opcodes"], key=lambda i: i["opcode"])
    output_dir: pathlib.Path = pathlib.Path(output_dir).resolve()
    output_filepath: pathlib.Path = output_dir / "remapping.txt"
    if output_filepath.exists():
        logger.debug(
            f"[!] {str(output_filepath)} already exists. Incrementing filename until an available name is found."
        )
        counter: int = 1
        while True:
            new_filepath: pathlib.Path = output_dir / f"remapping-{counter}.txt"
            if not new_filepath.exists():
                break
            counter += 1
        output_filepath = new_filepath
    output_dir.mkdir(parents=True, exist_ok=True)
    with output_filepath.open("w") as output_file_ptr:
        output_file_ptr.write(json.dumps(output_dict, sort_keys=True, indent=4))
    logger.info(f"[+] {str(output_filepath)} successfully written")
    return output_filepath
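# The emitted remapping file is JSON of roughly this shape (abridged, with
# hypothetical values; key order reflects json.dumps(sort_keys=True)):
#
#   {
#       "command_line": "\"remap ...\"",
#       "method": "standard_pyc_remap",
#       "python_version": "2.7",
#       "remapped_opcodes": [
#           {"guess": false, "opcode": 1, "opname": "POP_TOP", "remapped_value": 5},
#           ...
#       ]
#   }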
        remapped_source_size: int
        remapped_sip_hash: str
        (
            remapped_filename,
            remapped_co,
            remapped_version,
            remapped_timestamp,
            remapped_magic_int,
            remapped_is_pypy,
            remapped_source_size,
            remapped_sip_hash,
        ) = xdis.disasm.disassemble_file(str(remapped_bytecode_path), outstream=open(os.devnull, "w"))
    except Exception as e:
        logger.debug(f"Error disassembling remap megafile: {e}")
        logger.debug(
            "It is possible that this custom interpreter has tampered with the Python code compilation process in"
            " such a way that xdis cannot disassemble it. You can try manually inspecting the file to learn more."
        )
        raise RuntimeError
    remappings: Dict[int, Dict[int, int]] = bytecode.diff_opcode(reference_co, remapped_co, str(reference_version))
    return remappings, str(reference_version)
def validate_opmap(version: str, opmap: Dict[str, int]) -> bool:
    """Validate whether opmap is correct/well-formed for the given version.

    A well-formed opcode map should not have any duplicate keys or values, nor
    any missing or extraneous opnames or opcodes.

    Parameters
    ----------
    version : str
        Typically a string like '2.7' or '3.8.1'. However, the version string
        can be `any version accepted by xdis`_, including some weird alternate
        Python implementations like 2.7.1b3Jython or 3.5pypy.

    .. _any version accepted by xdis: https://github.com/rocky/python-xdis/blob/master/xdis/magics.py

    opmap : Dict[str, int]
        A dictionary of OPERATION NAME: OPCODE VALUE.

    Returns
    -------
    bool
        Whether or not this opcode map is valid and well-formed.
    """
    is_pypy: bool = "pypy" in version
    try:
        opcode_obj: ModuleType = xdis.main.get_opcode(version, is_pypy)
    except KeyError:
        raise KeyError(f"[!] The version specified, {version}, is not supported by xdis.")
    xdis_opcode_map: Dict[str, int] = opcode_obj.opmap
    validity: bool = True
    opname: str
    opcode: int
    for opname, opcode in opmap.items():
        if opname not in xdis_opcode_map.keys():
            logger.debug(
                f"[!] This opcode map contains the opname {opname}, which doesn't appear to be a valid "
                f"operation for Python {version}."
            )
            validity = False
        if list(opmap.keys()).count(opname) > 1:
            logger.debug(
                f"[!] This opcode map contains {list(opmap.keys()).count(opname)} entries for the opname {opname}."
            )
            validity = False
        if list(opmap.values()).count(opcode) > 1:
            logger.debug(
                f"[!] This opcode map contains {list(opmap.values()).count(opcode)} entries for the opcode {opcode}."
            )
            validity = False
    for opname, opcode in xdis_opcode_map.items():
        if opname not in opmap.keys():
            logger.debug(
                f"[!] This opcode map does not have an entry for the opname {opname}. In standard Python "
                f"{version}, this value is {opcode}."
            )
            validity = False
    if len(opmap.keys()) != len(xdis_opcode_map.keys()):
        logger.debug(
            f"[!] This opcode map has a size of {len(opmap.keys())}, when it should have a size of "
            f"{len(xdis_opcode_map.keys())} for Python version {version}."
        )
        validity = False
    return validity
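# Usage sketch (hypothetical `standard_27_opmap`, a complete OPNAME -> OPCODE
# dict for 2.7): a map with two opcodes swapped still validates, because it
# remains a bijection over the full standard opcode set, while a map with a
# duplicated value does not.
#
#   opmap = dict(standard_27_opmap)
#   opmap["POP_TOP"], opmap["ROT_TWO"] = opmap["ROT_TWO"], opmap["POP_TOP"]
#   assert validate_opmap("2.7", opmap)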
def decompile_pyc(
    arg_tuple: Tuple[pathlib.Path, Dict[str, int], Dict[str, Union[bool, os.PathLike]]]
) -> str:
    """Decompile a single Python bytecode file.

    Parameters
    ----------
    arg_tuple: Tuple[pathlib.Path, Dict[str, int], Dict[str, Union[bool, os.PathLike]]]
        A tuple containing the arguments for this function. This is a tuple
        because pebble's Pool.map() function couldn't pass multiple arguments
        to a subprocessed function call. The tuple entries correspond to the
        following arguments:

        pyc_file : pathlib.Path
            The path to the compiled Python file.
        alternate_opmap : Dict[str, int], optional
            If this bytecode file was produced by an interpreter with remapped
            opcodes, you must provide the opmap as an OPNAME: OPCODE
            dictionary.
        logging_options: Dict[str, Union[bool, os.PathLike]], optional
            A dictionary of logging options. This is only needed when
            pydecipher is performing multi-processed decompilation. The keys
            can be the following strings:

            verbose: bool
                True will enable verbose logging.
            quiet: bool
                True will silence all console logging.
            log_path: pathlib.Path
                If a path object is passed in as the log_path, the running
                instance of pydecipher will continue logging to that file.

    Returns
    -------
    str
        There are several different return values:

        * **no_action**: This file was not decompiled.
        * **success**: This file was successfully decompiled.
        * **error**: This file could not be decompiled 100% successfully.
        * **opcode_error**: The error message returned by uncompyle6 indicates
          this file may have remapped opcodes.
    """
    pyc_file: pathlib.Path = arg_tuple[0]
    alternate_opmap: Dict[str, int] = arg_tuple[1] or None
    logging_options: Dict[str, Union[bool, os.PathLike]] = arg_tuple[2] or None
    if not pyc_file.is_file():
        return "no_action"
    # Because this function runs in a new pydecipher process entirely, logging
    # options set during runtime (from command-line flags) do not carry over
    # automatically. We must pass these through manually and reset the options
    # for this specific process.
    if logging_options and not pydecipher.log_path:
        pydecipher.set_logging_options(**logging_options)
    hijacked_stdout: io.StringIO = io.StringIO()
    hijacked_stderr: io.StringIO = io.StringIO()
    with redirect_stdout(hijacked_stdout), redirect_stderr(hijacked_stderr):
        # Chop off the trailing c in .pyc to get the output filename.
        new_file_name: pathlib.Path = pathlib.Path(str(pyc_file.resolve())[:-1])
        # This would prohibit the overwriting of existing files:
        # if new_file_name.exists() and new_file_name.stat().st_size:
        #     return "no_action"
        logger.debug(f"[*] Decompiling file {pyc_file} of size {pyc_file.stat().st_size}")
        if not alternate_opmap:
            try:
                uncompyle6.decompile_file(str(pyc_file), outstream=sys.stdout)
            except uncompyle6.semantics.parser_error.ParserError as e:
                logger.warning(f"[!] Failed to decompile file {pyc_file}")
                if REMAPPED_OPCODE_ERROR_REGEX.match(str(e.error)):
                    logger.error(
                        f"[!] {pyc_file.name} failed to decompile with an error that indicates its opcode "
                        "mappings may have been remapped to prevent analysis."
                    )
                    return "opcode_error"
                return "error"
            except Exception as e:
                logger.error(f"[!] Failed to decompile file {pyc_file} with error: {e}")
                stdout_val: str = hijacked_stdout.getvalue()
                if stdout_val:
                    # Preserve whatever partial decompilation was produced.
                    with new_file_name.open("w") as file_ptr:
                        file_ptr.write(stdout_val)
                return "error"
            else:
                with new_file_name.open("w") as file_ptr:
                    file_ptr.write(hijacked_stdout.getvalue())
                logger.info(f"[+] Successfully decompiled {pyc_file}")
                return "success"
        filename: str
        co: CodeType  # can also be an xdis.Code* object
        version: float
        timestamp: int  # seconds since epoch
        magic_int: int
        is_pypy: bool
        source_size: int
        sip_hash: str
        try:
            (
                filename,
                co,
                version,
                timestamp,
                magic_int,
                is_pypy,
                source_size,
                sip_hash,
            ) = xdis.main.disassemble_file(
                str(pyc_file), outstream=open(os.devnull, "w"), alternate_opmap=alternate_opmap
            )
            output_file: TextIO
            with new_file_name.open(mode="w") as output_file:
                uncompyle6.main.decompile(
                    version,
                    co,
                    timestamp=timestamp,
                    source_size=source_size,
                    magic_int=magic_int,
                    is_pypy=is_pypy,
                    out=output_file,
                )
        except Exception as e:
            logger.info(f"[!] Failed to decompile file {pyc_file} with error: {e}")
            return "error"
        else:
            logger.info(f"[+] Successfully decompiled {pyc_file}")
            return "success"
def process_pycs(pyc_iterable: Iterable[os.PathLike], alternate_opmap: Dict[str, int] = None) -> None:
    """Multi-processed decompilation orchestration of compiled Python files.

    Currently, pydecipher uses `uncompyle6`_ as its decompiler. It works well
    with `xdis`_ (same author) and allows for the decompilation of Code
    objects using alternate opmaps (with our extension of xdis). This function
    will start up CPU count * 2 pydecipher processes to decompile the given
    Python. It also checks for an attached debugger, in which case the
    decompilation will be single-threaded to make debugging easier.

    .. _uncompyle6: https://github.com/rocky/python-uncompyle6/
    .. _xdis: https://github.com/rocky/python-xdis

    Parameters
    ----------
    pyc_iterable : Iterable[os.PathLike]
        An iterable of pathlib.Path objects, referencing compiled Python files
        to decompile.
    alternate_opmap : Dict[str, int], optional
        An opcode map of OPNAME: OPCODE (i.e. 'POP_TOP': 1). This should be a
        complete opmap for the Python version of the files being decompiled.
        Even if only two opcodes were swapped, the opcode map passed in should
        contain all 100+ Python bytecode operations.
    """
    return_status_codes: List[str] = []
    if sys.gettrace():
        # A debugger is attached; stay single-threaded for easier debugging.
        logger.debug("[!] Debugger detected, not using multiprocessing for decompilation of pyc files.")
        pyc_file: pathlib.Path
        for pyc_file in pyc_iterable:
            return_status_codes.append(
                decompile_pyc((pyc_file, alternate_opmap, pydecipher.get_logging_options()))
            )
    else:
        pool: pebble.ProcessPool
        with pebble.ProcessPool(os.cpu_count() * 2) as pool:
            iterables = [(pyc, alternate_opmap, pydecipher.get_logging_options()) for pyc in pyc_iterable]
            future: pebble.ProcessMapFuture = pool.map(decompile_pyc, iterables, timeout=300)
            iterator: Iterable = future.result()
            index: int = 0
            while True:
                failed_pyc_path: str = str(iterables[index][0]) if index < len(iterables) else ""
                try:
                    result: Any = next(iterator)
                    return_status_codes.append(result)
                except StopIteration:
                    break
                except TimeoutError as e:
                    logger.error(f"[!] Timed out ({e.args[1]}s) trying to decompile {failed_pyc_path}.")
                    return_status_codes.append("error")
                except pebble.ProcessExpired as e:
                    logger.error(
                        f"[!] Failed to decompile {failed_pyc_path} (process expired with status code {e.exitcode})."
                    )
                    return_status_codes.append("error")
                except Exception as e:
                    logger.error(f"[!] Failed to decompile {failed_pyc_path} with unknown error: {e}")
                    return_status_codes.append("error")
                finally:
                    index += 1
    successes: int = return_status_codes.count("success")
    opcode_errors: int = return_status_codes.count("opcode_error")
    errors: int = return_status_codes.count("error") + opcode_errors
    if opcode_errors:
        logger.warning(
            f"[!] {opcode_errors} file(s) failed to decompile with an error that indicates their opcode "
            "mappings may have been remapped. Try using `remap` on this set of bytecode."
        )
    if successes and not errors:
        logger.info(f"[+] Successfully decompiled {successes} .pyc files.")
    elif successes and errors:
        logger.warning(
            f"[!] Successfully decompiled {successes} .pyc files. Failed to decompile {errors} files. "
            "See log for more information."
        )
    elif not successes and errors:
        logger.error(f"[!] Failed to decompile all {errors} .pyc files. See log for more information.")
    else:
        logger.warning("[!] No pyc files were decompiled. See log for more information.")
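# Usage sketch (hypothetical paths): decompile all pyc files under an unpacked
# output directory, optionally with an alternate opmap (OPNAME -> OPCODE):
#
#   pycs = list(pathlib.Path("unpacked_output").rglob("*.pyc"))
#   process_pycs(pycs)                            # standard opcode mappings
#   process_pycs(pycs, alternate_opmap=my_opmap)  # remapped interpreter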
def parse_toc(self):
    # Read the CArchive cookie.
    if self.pyinstaller_version == 2.0 or self.pyinstaller_version == "unknown":
        try:
            (
                magic,
                self.length_of_package,
                self.toc_offset,
                self.toc_size,
                self.python_version,
            ) = struct.unpack(
                "!8siiii",
                self.archive_contents[self.magic_index:self.magic_index + self.PYINST20_COOKIE_SIZE],
            )
        except struct.error:
            pass
        else:
            self.pyinstaller_version = 2.0
    if self.pyinstaller_version == 2.1 or self.pyinstaller_version == "unknown":
        try:
            (
                magic,
                self.length_of_package,
                self.toc_offset,
                self.toc_size,
                self.python_version,
                self.python_dynamic_lib,
            ) = struct.unpack(
                "!8siiii64s",
                self.archive_contents[self.magic_index:self.magic_index + self.PYINST21_COOKIE_SIZE],
            )
        except struct.error:
            pass
        else:
            self.pyinstaller_version = 2.1
            if self.python_dynamic_lib:
                self.python_dynamic_lib = self.python_dynamic_lib.decode("ascii").rstrip("\x00")
    if self.pyinstaller_version == "unknown":
        logger.warning("[!] Could not parse CArchive because PyInstaller version is unknown.")
        return
    self.python_version = float(self.python_version) / 10
    logger.info(f"[*] This CArchive was built with Python {self.python_version}")
    logger.debug(f"[*] CArchive Package Size: {self.length_of_package}")
    logger.debug(f"[*] CArchive Python Version: {self.python_version}")
    if self.pyinstaller_version == 2.1:
        logger.debug(f"[*] CArchive Python Dynamic Library Name: {self.python_dynamic_lib}")
    self.toc = []
    toc_bytes = self.archive_contents[self.toc_offset:self.toc_offset + self.toc_size]
    while toc_bytes:
        (entry_size,) = struct.unpack("!i", toc_bytes[0:4])
        name_length = entry_size - self.CTOCEntry.ENTRYLEN
        (
            entry_offset,
            compressed_data_size,
            uncompressed_data_size,
            compression_flag,
            type_code,
            name,
        ) = struct.unpack(f"!iiiBB{name_length}s", toc_bytes[4:entry_size])
        name = name.decode("utf-8").rstrip("\0")
        if name == "":
            name = str(uniquename())
            logger.debug(f"[!] Warning: Found an unnamed file in CArchive. Using random name {name}")
        type_code = chr(type_code)
        self.toc.append(
            self.CTOCEntry(
                entry_offset,
                compressed_data_size,
                uncompressed_data_size,
                compression_flag,
                type_code,
                name,
            )
        )
        toc_bytes = toc_bytes[entry_size:]
    logger.debug(f"[*] Found {len(self.toc)} entries in this PyInstaller CArchive")
def dump_certificates(self, output_dir: pathlib.Path = None) -> None:
    """Dump Authenticode certificates from the PE's certificate attribute table.

    Parameters
    ----------
    output_dir: pathlib.Path, optional
        An optional alternative output directory to dump the certificates,
        besides the class's output directory.
    """
    certificate_table_entry: pefile.Structure = None
    if hasattr(self.pe, "OPTIONAL_HEADER") and hasattr(self.pe.OPTIONAL_HEADER, "DATA_DIRECTORY"):
        idx: int
        for idx in range(len(self.pe.OPTIONAL_HEADER.DATA_DIRECTORY)):
            directory: pefile.Structure = self.pe.OPTIONAL_HEADER.DATA_DIRECTORY[idx]
            if directory.name == "IMAGE_DIRECTORY_ENTRY_SECURITY" and directory.Size:
                logger.debug("[*] This PE has a certificate table.")
                certificate_table_entry = directory
                break
    if certificate_table_entry is None:
        return
    if output_dir is None:
        certificate_extraction_dir: pathlib.Path = self.output_dir.joinpath("Authenticode_Certificates")
    else:
        certificate_extraction_dir: pathlib.Path = output_dir
    certificate_extraction_dir.mkdir(parents=True, exist_ok=True)
    certificate_table_data: bytes = self.pe.__data__[certificate_table_entry.VirtualAddress:]
    while certificate_table_data:
        # https://docs.microsoft.com/en-us/windows/desktop/Debug/pe-format#the-attribute-certificate-table-image-only
        cert_length: int = int.from_bytes(certificate_table_data[0:4], byteorder="little")
        cert_version: bytes = certificate_table_data[4:6]  # noqa
        cert_type = certificate_table_data[6:8]  # noqa
        cert: bytes = certificate_table_data[8:8 + cert_length]
        certificate_table_data = certificate_table_data[8 + cert_length:]
        # Extract all the X509 certificates from the PKCS#7 structure.
        authenticode_structure: signify.authenticode.AuthenticodeSignedData = (
            AuthenticodeSignedData.from_envelope(cert)
        )
        cert_obj: signify.certificates.Certificate
        for cert_obj in authenticode_structure.certificates:
            cert_name_obj: asn1crypto.x509.Name = cert_obj.to_asn1crypto.subject
            preferred_name_fields: List[str] = [
                "organizational_unit_name",
                "organization_name",
                "common_name",
            ]
            name_selected: bool = False
            preferred_field_name: str
            for preferred_field_name in preferred_name_fields:
                name_tuple: Tuple[str, str]
                for name_tuple in cert_name_obj.native.items():
                    field: str = name_tuple[0]
                    value: str = name_tuple[1]
                    if field == preferred_field_name:
                        name_selected = True
                        cert_name: str = value
                        break
                if name_selected:
                    break
            if not name_selected:
                cert_name: str = f"{len(os.listdir(certificate_extraction_dir))}"
            cert_name = utils.slugify(cert_name, allow_unicode=True) + ".pem"
            logger.debug(f"[+] Extracting Authenticode certificate {cert_name}.")
            f: BinaryIO
            with certificate_extraction_dir.joinpath(cert_name).open("wb") as f:
                der_bytes: bytes = cert_obj.to_asn1crypto.dump()
                pem_bytes: bytes = pem.armor("CERTIFICATE", der_bytes)
                f.write(pem_bytes)
            self.certificates_dumped = True