def extract_macros_from_office2003(fullpath, fileobj=None):
    '''
    Extract the VBA macro sources found in an office document.

    Extraction is best effort: if the parser fails part-way through, the
    macros collected so far are returned instead of raising.

    :param fullpath: path of the host document; handed to ``VBA_Parser`` and
        recorded as the host of every macro found.
    :param fileobj: optional readable object; when given, its whole content
        is read and passed to the parser as ``data`` .
    :return: [(host_fullpath, filename_from_host, data), ... ]
    '''
    from oletools.olevba import VBA_Parser

    vp = VBA_Parser(fullpath, data=fileobj.read() if fileobj else None)
    r = []
    try:
        if vp.detect_vba_macros():
            # Loop invariant: the host's own base name.
            host_name = os.path.basename(fullpath)
            # Macros were detected; an empty extraction result simply
            # yields no entries.
            for (subfullpath, stream_path, vba_filename,
                 vba_code) in vp.extract_all_macros() or ():
                sub_name = os.path.basename(subfullpath)
                vba_filename += u'.vba'
                # Prefix the macro with its container name only when the
                # container is not the host document itself.
                if host_name == sub_name:
                    name_from_host = vba_filename
                else:
                    name_from_host = u'{0}_{1}'.format(sub_name, vba_filename)
                r.append((io_text_arg(fullpath), io_text_arg(name_from_host),
                          vba_code))
    except Exception:
        # Deliberate best effort: a parser failure keeps what was collected.
        # Unlike a bare ``except`` this lets KeyboardInterrupt / SystemExit
        # propagate.
        pass
    finally:
        vp.close()
    return r
def extract_office2003_from_unknown_office(fullpath, fileobj=None):
    '''
    Extract the office2003 (OLE) documents contained in an office file of
    unknown format (it may be office2003 or office2007).

    * If the input itself is an OLE file, it is returned as the only entry.
    * If the input is a zip archive, every member whose leading bytes equal
      ``olefile.MAGIC`` is returned as an in-memory ``io.BytesIO`` copy.

    :param fullpath: path of the host file; used as the input when
        ``fileobj`` is not given, and recorded as the host of every entry.
    :param fileobj: optional file object used as the input instead of
        ``fullpath`` .
    :return: [ (host_fullpath,filename_from_host,<file_open_handler>), ]
        For the OLE case the handler is ``fileobj`` itself, or a file opened
        here in ``'rb'`` mode that the caller is left to close.
    :raises ValueError: the input is neither an OLE file nor a zip archive.
    '''
    import zipfile
    import olefile
    import io

    source = fileobj if fileobj else fullpath
    r = []
    if olefile.isOleFile(source):
        r.append((fullpath, os.path.basename(fullpath),
                  fileobj if fileobj else open(fullpath, 'rb')))
    elif zipfile.is_zipfile(source):
        magic_size = len(olefile.MAGIC)
        with zipfile.ZipFile(source) as z:
            for subfile in z.namelist():
                with z.open(subfile) as zt:
                    magic = zt.read(magic_size)
                    if magic == olefile.MAGIC:
                        # Finish reading from the handle that is already
                        # open instead of opening the member a second time
                        # (the second handle was never closed).
                        r.append((fullpath, io_text_arg(subfile),
                                  io.BytesIO(magic + zt.read())))
    else:
        raise ValueError(u'not office file')
    return r
def extract_attachment_from_msg(fullpath):
    '''
    List the attachments of the message file at ``fullpath`` .

    :param fullpath: path handed to ``ExtractMsg.Message`` ; also recorded
        as the host of every attachment.
    :return: [(host_fullpath, filename_from_host, file_content)]
    '''
    from ExtractMsg import Message

    # Keep the message object referenced while its attachments are read.
    message = Message(fullpath)
    collected = [(fullpath, io_text_arg(item.longFilename), item.data)
                 for item in message.attachments]
    return collected
def _extract_attachment_from_attachment(attachment, depth, results):
    '''
    Collect the payloads of one message part into ``results`` ; the function
    calls itself for every sub-part of a multipart part.

    The entry name is the part's filename, else the last path component of
    its ``Content-Location`` header, else ``noname.emb`` , prefixed with
    ``depth`` (left-aligned, zero-filled to 3 characters).

    :param attachment: message part exposing ``get_filename`` , ``get`` ,
        ``is_multipart`` and ``get_payload`` .
    :param depth: number used for the name prefix; sub-parts are numbered
        ``depth * 10 + 1`` , ``depth * 10 + 2`` , ...
    :param results: list that receives ``(name, data)`` tuples.
    :return: None
    '''
    from base64_to_office import decode_mso_to_office, is_mso_buffer

    name = io_text_arg(attachment.get_filename())
    if name is None:
        location = attachment.get(u'Content-Location', None)
        if location:
            name = os.path.split(location)[-1]
    name = u'{0:0<3}.{1}'.format(depth, name or u'noname.emb')

    if attachment.is_multipart():
        child_depth = depth * 10
        for part in attachment.get_payload(decode=False):
            child_depth += 1
            _extract_attachment_from_attachment(part, child_depth, results)
        return

    payload = attachment.get_payload(decode=True)
    # Only payloads recognised by is_mso_buffer are decoded; a failed or
    # skipped decode falls back to storing the payload unchanged.
    office = decode_mso_to_office(payload) if is_mso_buffer(payload) else None
    if office:
        results.append((name + u'.office', office))
    else:
        results.append((name, payload))
import argparse import os import sys curpath = os.path.dirname(os.path.realpath(__file__)) sys.path.append(os.path.abspath(os.path.join(curpath, '../'))) from io_in_out import io_text_arg from io_in_out import io_is_path_valid from io_in_out import io_sys_stdout from io_in_out import io_hash_stream from io_in_out import io_hash_memory from io_in_out import io_print from io_in_out import io_files_from_arg from io_in_out import io_path_format curpath = io_text_arg(curpath) def dump_sub_file(host_fullpath, filename_from_host, data_or_fileobj_to_write): ''' 从文件中内嵌出来的文件可能文件名是无效的,无法创建文件,这个函数来规范化文件名 :return: the final sub file fullpath ''' import random import shutil # must detect path sep first _func_replace_os_path_sep = lambda x: x.replace(u'/', u'_').replace( u'\\', u'_')
def _read_c_string(buf, start):
    '''
    Read the NUL-terminated byte string that begins at offset ``start`` .

    :return: (string_without_terminator, offset_just_past_the_terminator);
        when no terminator is found the string runs to the end of ``buf`` .
    '''
    end = buf.find(b'\x00', start)
    if end < 0:
        end = len(buf)
    return buf[start:end], end + 1


def escape_office_10native_from_buffer(stream_buffer):
    '''
    Extract the file embedded in an OLE native stream buffer.

    ref https://raw.githubusercontent.com/unixfreak0037/officeparser/master/officeparser.py
    The parser above contains a bug; it is corrected here following the
    Microsoft article below.
    ref https://code.msdn.microsoft.com/office/CSOfficeDocumentFileExtract-e5afce86

    Layout as read by this function (integers are little endian):
    4-byte size, 2-byte unknown value, NUL-terminated file name,
    NUL-terminated source path, 4-byte unknown value, 4-byte size of the
    temporary-path field, the temporary path (NUL-terminated inside that
    field), 4-byte payload size, payload.

    The NUL scans use ``find`` and the integers are read in place with
    ``struct.unpack_from`` , so the function works whether indexing the
    buffer yields 1-character strings or integers.

    :param stream_buffer: byte string holding the whole stream.
    :return: None when the stored payload size is 0, otherwise
        (filename, <fullpath before put in ole>,
        <fullpath to write from ole>, data)
    :raises ValueError: the temporary path is longer than its declared
        field size.
    :raises struct.error: the buffer is too short for a fixed-size field.
    '''
    # The leading size and the following 16-bit value are not used, but
    # unpacking them still rejects a buffer too short to contain them.
    struct.unpack_from('<L', stream_buffer, 0)
    struct.unpack_from('<H', stream_buffer, 4)
    pos = 6

    filename, pos = _read_c_string(stream_buffer, pos)
    src_path, pos = _read_c_string(stream_buffer, pos)

    # Unknown 32-bit value (the original author guessed a timestamp).
    struct.unpack_from('<L', stream_buffer, pos)
    pos += 4

    # Size of the field that holds the temporary path of the embedded file.
    temp_path_size = struct.unpack_from('<L', stream_buffer, pos)[0]
    pos += 4

    dst_path, _ = _read_c_string(stream_buffer, pos)
    # Fix for the bug in the first ref: the path must fit its field, and
    # the cursor advances by the declared field size, not the string length.
    if len(dst_path) > temp_path_size:
        raise ValueError(u'stream decode error, len(dst_path)>temp_path_size ')
    pos += temp_path_size

    # Size of the payload that follows.
    actual_size = struct.unpack_from('<L', stream_buffer, pos)[0]
    if not actual_size:
        return None
    pos += 4

    return (io_text_arg(filename), io_text_arg(src_path),
            io_text_arg(dst_path), stream_buffer[pos:pos + actual_size])