def scan(self, data, file, options, expire_at):
    """Parses TNEF (winmail.dat) metadata and extracts attachments.

    Records TNEF object names and selected message properties in the
    event, then uploads each attachment (and the HTML body, if present)
    as an extracted file.
    """
    self.event['total'] = {'attachments': 0, 'extracted': 0}
    self.event.setdefault('object_names', [])

    tnef = tnefparse.TNEF(data)
    tnef_objects = getattr(tnef, 'objects', [])
    for tnef_object in tnef_objects:
        descriptive_name = tnefparse.TNEF.codes.get(tnef_object.name)
        if descriptive_name not in self.event['object_names']:
            self.event['object_names'].append(descriptive_name)

        try:
            # Strip NUL padding; treat an all-NUL payload as absent.
            # Narrowed from a bare `except:` — only the failure modes of
            # calling bytes.strip on a non-bytes payload are expected here.
            object_data = tnef_object.data.strip(b'\0') or None
        except (AttributeError, TypeError):
            object_data = tnef_object.data

        if object_data is not None:
            if descriptive_name == 'Subject':
                self.event['subject'] = object_data
            elif descriptive_name == 'Message ID':
                self.event['message_id'] = object_data
            elif descriptive_name == 'Message Class':
                self.event['message_class'] = object_data

    tnef_attachments = getattr(tnef, 'attachments', [])
    self.event['total']['attachments'] = len(tnef_attachments)
    for attachment in tnef_attachments:
        extract_file = strelka.File(
            name=attachment.name.decode(),
            source=self.name,
        )
        for c in strelka.chunk_string(attachment.data):
            self.upload_to_coordinator(
                extract_file.pointer,
                c,
                expire_at,
            )
        self.files.append(extract_file)
        self.event['total']['extracted'] += 1

    tnef_html = getattr(tnef, 'htmlbody', None)
    if tnef_html is not None:
        extract_file = strelka.File(
            name='htmlbody',
            source=self.name,
        )
        for c in strelka.chunk_string(tnef_html):
            self.upload_to_coordinator(
                extract_file.pointer,
                c,
                expire_at,
            )
        self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Extracts embedded objects from an RTF document via oletools rtfobj.

    Each parsed object is uploaded as an extracted file, up to the
    `limit` option (default 1000).
    """
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'objects': 0, 'extracted': 0}

    rtf = rtfobj.RtfObjParser(data)
    rtf.parse()
    self.event['total']['objects'] = len(rtf.objects)

    for rtf_object in rtf.objects:
        if self.event['total']['extracted'] >= file_limit:
            break

        # Fix: index into the parser's object list (`rtf.objects`);
        # the original `rtf.server.index(...)` referenced a nonexistent
        # attribute and raised AttributeError. Also renamed the loop
        # variable so it no longer shadows the builtin `object`.
        index = rtf.objects.index(rtf_object)

        if rtf_object.is_package:
            # OLE Package objects embed a file with its own name/payload.
            extract_file = strelka.File(
                name=rtf_object.filename,
                source=self.name,
            )
            for c in strelka.chunk_string(rtf_object.olepkgdata):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    c,
                    expire_at,
                )
        elif rtf_object.is_ole:
            extract_file = strelka.File(
                name=f'object_{index}',
                source=self.name,
            )
            for c in strelka.chunk_string(rtf_object.oledata):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    c,
                    expire_at,
                )
        else:
            # Not recognized as OLE; fall back to the raw object bytes.
            extract_file = strelka.File(
                name=f'object_{index}',
                source=self.name,
            )
            for c in strelka.chunk_string(rtf_object.rawdata):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    c,
                    expire_at,
                )
        self.files.append(extract_file)
        self.event['total']['extracted'] += 1
def scan(self, data, file, options, expire_at):
    """Extracts streams from an OLE compound file.

    Ole10Native streams are unwrapped via oletools so the embedded
    native file is extracted under its original name when available.
    """
    self.event['total'] = {'streams': 0, 'extracted': 0}

    ole = None
    try:
        ole = olefile.OleFileIO(data)
        ole_streams = ole.listdir(streams=True)
        self.event['total']['streams'] = len(ole_streams)
        for stream in ole_streams:
            # Renamed from `file` so the scan() parameter is not clobbered.
            stream_file = ole.openstream(stream)
            extract_data = stream_file.read()
            extract_name = f'{"_".join(stream)}'
            # Remove control characters from the joined stream path.
            extract_name = re.sub(r'[\x00-\x1F]', '', extract_name)
            if extract_name.endswith('Ole10Native'):
                native_stream = oletools.oleobj.OleNativeStream(
                    bindata=extract_data,
                )
                if native_stream.filename:
                    extract_name = extract_name + f'_{str(native_stream.filename)}'
                else:
                    extract_name = extract_name + '_native_data'

                extract_file = strelka.File(
                    name=extract_name,
                    source=self.name,
                )
                for c in strelka.chunk_string(native_stream.data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
            else:
                extract_file = strelka.File(
                    name=extract_name,
                    source=self.name,
                )
                for c in strelka.chunk_string(extract_data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
            self.files.append(extract_file)
            self.event['total']['extracted'] += 1
    except OSError:
        self.flags.append('os_error')
    finally:
        # Fix: `ole` was closed unconditionally, raising
        # UnboundLocalError when OleFileIO() itself failed.
        if ole is not None:
            ole.close()
def scan(self, data, file, options, expire_at):
    """Extracts files from a password-protected ZIP archive.

    Recovers the archive password with the `crack_zip` helper (John the
    Ripper dictionary and/or brute force), then decrypts and extracts
    members up to the `limit` option. Flags the scan when no password
    can be recovered or the archive is malformed.
    """
    jtr_path = options.get('jtr_path', '/jtr/')
    tmp_directory = options.get('tmp_file_directory', '/tmp/')
    file_limit = options.get('limit', 1000)
    password_file = options.get('password_file', '/etc/strelka/passwords.dat')
    log_extracted_pws = options.get('log_pws', False)
    scanner_timeout = options.get('scanner_timeout', 150)
    brute = options.get('brute_force', False)
    max_length = options.get('max_length', 5)

    self.event['total'] = {'files': 0, 'extracted': 0}

    with io.BytesIO(data) as zip_io:
        try:
            with zipfile.ZipFile(zip_io) as zip_obj:
                name_list = zip_obj.namelist()
                self.event['total']['files'] = len(name_list)

                # Attempt password recovery; bounded by scanner_timeout.
                extracted_pw = crack_zip(
                    self,
                    data,
                    jtr_path,
                    tmp_directory,
                    brute=brute,
                    scanner_timeout=scanner_timeout,
                    max_length=max_length,
                    password_file=password_file)
                if not extracted_pw:
                    self.flags.append('Could not extract password')
                    return
                if log_extracted_pws:
                    self.event['cracked_password'] = extracted_pw

                for i, name in enumerate(name_list):
                    # Entries ending in '/' are directories — skip them.
                    if not name.endswith('/'):
                        if self.event['total']['extracted'] >= file_limit:
                            break
                        try:
                            # Decrypt and read with the recovered password.
                            extract_data = zip_obj.read(name, extracted_pw)
                            if extract_data:
                                extract_file = strelka.File(
                                    name=name,
                                    source=self.name,
                                )
                                for c in strelka.chunk_string(extract_data):
                                    self.upload_to_coordinator(
                                        extract_file.pointer,
                                        c,
                                        expire_at,
                                    )
                                self.files.append(extract_file)
                                self.event['total']['extracted'] += 1
                        except NotImplementedError:
                            self.flags.append('unsupported_compression')
                        except RuntimeError:
                            self.flags.append('runtime_error')
                        except ValueError:
                            self.flags.append('value_error')
                        except zlib.error:
                            self.flags.append('zlib_error')
        except zipfile.BadZipFile:
            self.flags.append('bad_zip')
def scan(self, data, file, options, expire_at):
    """Runs antiword over the document and extracts its text output."""
    tmp_dir = options.get('tmp_directory', '/tmp/')

    with tempfile.NamedTemporaryFile(dir=tmp_dir) as doc_tmp:
        doc_tmp.write(data)
        doc_tmp.flush()

        proc = subprocess.Popen(
            ['antiword', doc_tmp.name],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        )
        stdout, stderr = proc.communicate()

        if stdout:
            extract_file = strelka.File(
                name='text',
                source=self.name,
            )
            for chunk in strelka.chunk_string(stdout):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    chunk,
                    expire_at,
                )
            self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Base64-decodes the payload and extracts the decoded bytes."""
    with io.BytesIO(data) as encoded_file:
        decoded = b''
        try:
            decoded = base64.b64decode(encoded_file.read())
            # Keep the first 50 decoded bytes for triage.
            self.event['decoded_header'] = decoded[:50]
        except binascii.Error:
            self.flags.append('not_decodable_from_base64')

        if decoded:
            extract_file = strelka.File(
                source=self.name,
            )
            for chunk in strelka.chunk_string(decoded):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    chunk,
                    expire_at,
                )
            self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Attempts UPX decompression via the `upx` binary and extracts the result."""
    tmp_directory = options.get('tmp_directory', '/tmp/')

    with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
        tmp_data.write(data)
        tmp_data.flush()

        upx_output = f'{tmp_data.name}_upx'
        upx_return = subprocess.call(
            ['upx', '-d', tmp_data.name, '-o', upx_output],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL)

        if upx_return == 0:
            try:
                with open(upx_output, 'rb') as upx_fin:
                    upx_file = upx_fin.read()
                # Only extract if decompression actually grew the file.
                if len(upx_file) > len(data):
                    extract_file = strelka.File(source=self.name, )
                    for c in strelka.chunk_string(upx_file):
                        self.upload_to_coordinator(
                            extract_file.pointer,
                            c,
                            expire_at,
                        )
                    self.files.append(extract_file)
            finally:
                # Fix: always remove the decompressed artifact — it was
                # previously leaked when the output was not larger than
                # the input.
                os.remove(upx_output)
        else:
            self.flags.append(f'return_code_{upx_return}')
def scan(self, data, file, options, expire_at):
    """Extracts regular files from archives readable by libarchive."""
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'files': 0, 'extracted': 0}

    try:
        with libarchive.memory_reader(data) as archive:
            for entry in archive:
                self.event['total']['files'] += 1
                if not entry.isfile:
                    continue
                # Past the limit we keep counting files but stop extracting.
                if self.event['total']['extracted'] >= file_limit:
                    continue
                extract_file = strelka.File(
                    name=entry.pathname,
                    source=self.name,
                )
                for block in entry.get_blocks():
                    self.upload_to_cache(
                        extract_file.pointer,
                        block,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.event['total']['extracted'] += 1
    except libarchive.ArchiveError:
        self.flags.append('libarchive_archive_error')
def scan(self, data, file, options, expire_at):
    """Extracts members of a RAR archive that are not password protected."""
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'files': 0, 'extracted': 0}

    with io.BytesIO(data) as rar_io:
        with rarfile.RarFile(rar_io) as rar_obj:
            info_list = rar_obj.infolist()
            self.event['total']['files'] = len(info_list)
            for member in info_list:
                if member.isdir():
                    continue
                if self.event['total']['extracted'] >= file_limit:
                    break
                file_info = rar_obj.getinfo(member)
                if file_info.needs_password():
                    self.flags.append('password_protected')
                    continue
                # Record which OS produced the archive entry.
                self.event['host_os'] = HOST_OS_MAPPING[file_info.host_os]
                extract_file = strelka.File(
                    name=f'{file_info.filename}',
                    source=self.name,
                )
                for chunk in strelka.chunk_string(rar_obj.read(member)):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        chunk,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.event['total']['extracted'] += 1
def scan(self, data, file, options, expire_at):
    """Decompresses LZMA/XZ data and extracts the result."""
    try:
        with io.BytesIO(data) as lzma_io:
            with lzma.LZMAFile(filename=lzma_io) as lzma_obj:
                try:
                    inflated = lzma_obj.read()
                    self.event['decompressed_size'] = len(inflated)

                    extract_file = strelka.File(source=self.name, )
                    for chunk in strelka.chunk_string(inflated):
                        self.upload_to_coordinator(
                            extract_file.pointer,
                            chunk,
                            expire_at,
                        )
                    self.files.append(extract_file)
                except EOFError:
                    self.flags.append('eof_error')
    except lzma.LZMAError:
        self.flags.append('lzma_error')
def scan(self, data, file, options, expire_at):
    """Extracts signer certificates from a PKCS#7 blob."""
    tmp_dir = options.get('tmp_directory', '/tmp/')
    self.event['total'] = {'certificates': 0, 'extracted': 0}

    with tempfile.NamedTemporaryFile(dir=tmp_dir) as tmp_data:
        tmp_data.write(data)
        tmp_data.flush()

        # b'0' is byte 0x30, the DER SEQUENCE tag — a leading 0x30
        # indicates DER encoding; anything else is treated as PEM.
        if data[:1] == b'0':
            pkcs7 = SMIME.load_pkcs7_der(tmp_data.name)
        else:
            pkcs7 = SMIME.load_pkcs7(tmp_data.name)

        certs = pkcs7.get0_signers(X509.X509_Stack())
        if certs:
            self.event['total']['certificates'] = len(certs)
            for cert in certs:
                extract_file = strelka.File(
                    name=f'sn_{cert.get_serial_number()}',
                    source=self.name,
                )
                for chunk in strelka.chunk_string(cert.as_der()):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        chunk,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.event['total']['extracted'] += 1
def scan(self, data, file, options, expire_at):
    """Identifies SWF compression and extracts a decompressed FWS copy."""
    with io.BytesIO(data) as swf_io:
        # The uncompressed file length is little-endian at offset 4.
        swf_io.seek(4)
        swf_size = struct.unpack('<i', swf_io.read(4))[0]
        swf_io.seek(0)
        magic = swf_io.read(3)
        # Rebuild an uncompressed header: 'FWS' + version + length field.
        extract_data = b'FWS' + swf_io.read(5)

        if magic == b'CWS':
            self.event['type'] = 'CWS'
            try:
                extract_data += zlib.decompress(swf_io.read())[:swf_size - 8]
                extract_file = strelka.File(
                    source=self.name,
                )
                for chunk in strelka.chunk_string(extract_data):
                    self.upload_to_cache(
                        extract_file.pointer,
                        chunk,
                        expire_at,
                    )
                self.files.append(extract_file)
            except zlib.error:
                self.flags.append('zlib_error')
        elif magic == b'ZWS':
            self.event['type'] = 'ZWS'
            # The LZMA payload starts after the 12-byte ZWS header.
            swf_io.seek(12)
            extract_data += pylzma.decompress(swf_io.read())[:swf_size - 8]
            extract_file = strelka.File(
                source=self.name,
            )
            for chunk in strelka.chunk_string(extract_data):
                self.upload_to_cache(
                    extract_file.pointer,
                    chunk,
                    expire_at,
                )
            self.files.append(extract_file)
        elif magic == b'FWS':
            # Already uncompressed; nothing to extract.
            self.event['type'] = 'FWS'
def scan(self, data, file, options, expire_at):
    """Extracts files from a ZIP archive, flagging encrypted entries."""
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'files': 0, 'extracted': 0}

    with io.BytesIO(data) as zip_io:
        try:
            with zipfile.ZipFile(zip_io) as zip_obj:
                name_list = zip_obj.namelist()
                self.event['total']['files'] = len(name_list)
                for name in name_list:
                    # Entries ending in '/' are directories — skip.
                    if not name.endswith('/'):
                        if self.event['total']['extracted'] >= file_limit:
                            break
                        try:
                            extract_data = b''
                            zinfo = zip_obj.getinfo(name)
                            # Bit 0 of the general-purpose flag marks
                            # an encrypted entry.
                            if zinfo.flag_bits & 0x1:
                                # Fix: the flag was previously appended
                                # only when the *first* entry (i == 0)
                                # was encrypted; now appended once for
                                # any encrypted entry.
                                if 'encrypted' not in self.flags:
                                    self.flags.append('encrypted')
                            else:
                                extract_data = zip_obj.read(name)
                            if extract_data:
                                extract_file = strelka.File(
                                    name=name,
                                    source=self.name,
                                )
                                for c in strelka.chunk_string(
                                        extract_data):
                                    self.upload_to_coordinator(
                                        extract_file.pointer,
                                        c,
                                        expire_at,
                                    )
                                self.files.append(extract_file)
                                self.event['total']['extracted'] += 1
                        except NotImplementedError:
                            self.flags.append('unsupported_compression')
                        except RuntimeError:
                            self.flags.append('runtime_error')
                        except ValueError:
                            self.flags.append('value_error')
                        except zlib.error:
                            self.flags.append('zlib_error')
        except zipfile.BadZipFile:
            self.flags.append('bad_zip')
def scan(self, data, file, options, expire_at):
    """Extracts and optionally analyzes VBA macros via oletools olevba."""
    analyze_macros = options.get('analyze_macros', True)
    self.event['total'] = {'files': 0, 'extracted': 0}

    vba = None
    try:
        vba = olevba3.VBA_Parser(filename=file.name, data=data)
        if vba.detect_vba_macros():
            extract_macros = list(vba.extract_macros())
            self.event['total']['files'] = len(extract_macros)
            for (filename, stream_path, vba_filename, vba_code) in extract_macros:
                extract_file = strelka.File(
                    name=f'{vba_filename}',
                    source=self.name,
                )
                for c in strelka.chunk_string(vba_code):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
                self.event['total']['extracted'] += 1

            if analyze_macros:
                self.event.setdefault('auto_exec', [])
                self.event.setdefault('base64', [])
                self.event.setdefault('dridex', [])
                self.event.setdefault('hex', [])
                self.event.setdefault('ioc', [])
                self.event.setdefault('suspicious', [])
                # Bucket olevba findings by their analysis category.
                macros = vba.analyze_macros()
                for (macro_type, keyword, description) in macros:
                    if macro_type == 'AutoExec':
                        self.event['auto_exec'].append(keyword)
                    elif macro_type == 'Base64 String':
                        self.event['base64'].append(keyword)
                    elif macro_type == 'Dridex String':
                        self.event['dridex'].append(keyword)
                    elif macro_type == 'Hex String':
                        self.event['hex'].append(keyword)
                    elif macro_type == 'IOC':
                        self.event['ioc'].append(keyword)
                    elif macro_type == 'Suspicious':
                        self.event['suspicious'].append(keyword)
    except olevba3.FileOpenError:
        self.flags.append('file_open_error')
    finally:
        # Fix: VBA_Parser() itself may raise, leaving `vba` unassigned;
        # closing unconditionally raised UnboundLocalError.
        if vba is not None:
            vba.close()
def scan(self, data, file, options, expire_at):
    """Attempts to decrypt an encrypted Office document using a password list.

    Candidate passwords are loaded once from `password_file` and cached
    on the scanner instance (`self.passwords`). Each is tried in order;
    on success the decrypted document is extracted and the matching
    password recorded in the event.
    """
    password_file = options.get('password_file', '/etc/strelka/passwords.dat')

    # Lazily populate the shared password list on first use.
    if not self.passwords:
        if os.path.isfile(password_file):
            with open(password_file, 'rb') as f:
                for line in f:
                    self.passwords.append(line.strip())

    with io.BytesIO(data) as doc_io:
        msoff_doc = msoffcrypto.OfficeFile(doc_io)
        output_doc = io.BytesIO()
        password = ''
        extract_data = b''

        if msoff_doc.is_encrypted():
            self.flags.append('password_protected')
            for pw in self.passwords:
                if not password:
                    try:
                        msoff_doc.load_key(password=pw.decode('utf-8'))
                        # Rewind before and after decrypting so the full
                        # plaintext is read back out of the buffer.
                        output_doc.seek(0)
                        msoff_doc.decrypt(output_doc)
                        output_doc.seek(0)
                        if output_doc.readable():
                            extract_data = output_doc.read()
                            password = pw.decode('utf-8')
                            break
                    # NOTE(review): broad catch kept deliberately —
                    # msoffcrypto raises varied exception types on a
                    # wrong password; a failure just means "try next".
                    except Exception:
                        pass

            if password:
                self.event['password'] = password
                extract_file = strelka.File(
                    source=self.name,
                )
                for c in strelka.chunk_string(extract_data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
            else:
                self.flags.append('no_password_match_found')
def scan(self, data, file, options, expire_at):
    """Base64-decodes the payload and extracts the decoded bytes."""
    payload = base64.b64decode(data)

    extract_file = strelka.File(source=self.name, )
    for chunk in strelka.chunk_string(payload):
        self.upload_to_coordinator(
            extract_file.pointer,
            chunk,
            expire_at,
        )
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Parses an email message, recording selected headers and extracting parts."""
    wanted_headers = options.get('headers', [])
    self.event['total'] = {'parts': 0, 'extracted': 0}

    try:
        message = email.message_from_string(
            data.decode('UTF-8', 'replace'))

        self.event['headers'] = []
        for header, value in message.items():
            # An empty `headers` option means "record everything".
            if wanted_headers and header not in wanted_headers:
                continue
            self.event['headers'].append({
                'header': header,
                'value': value,
            })

        self.event['parts'] = []
        for index, part in enumerate(message.walk()):
            self.event['total']['parts'] += 1
            payload = part.get_payload(decode=True)
            if payload is None:
                continue
            part_filename = part.get_filename()
            if part_filename is not None:
                extract_name = f'{part_filename}'
                self.event['parts'].append(part_filename)
            else:
                extract_name = f'part_{index}'

            extract_file = strelka.File(
                name=extract_name,
                source=self.name,
            )
            extract_file.add_flavors(
                {'external': [part.get_content_type()]})
            for chunk in strelka.chunk_string(payload):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    chunk,
                    expire_at,
                )
            self.files.append(extract_file)
            self.event['total']['extracted'] += 1
    except AssertionError:
        self.flags.append('assertion_error')
def _recurse_node(self, node, xml_args):
    """Recursively parses XML file.

    The XML file is recursively parsed down every node tree.

    Args:
        node: node to be recursively parsed.
        xml_args: options set by the scanner that affect XML parsing.
    """
    if node is not None:
        if hasattr(node.tag, '__getitem__'):
            # ElementTree encodes namespaced tags as '{namespace}tag'.
            if node.tag.startswith('{'):
                namespace, separator, tag = node.tag[1:].partition('}')
            else:
                namespace = None
                tag = node.tag

            self.event['total']['tags'] += 1
            if namespace not in self.event['namespaces']:
                self.event['namespaces'].append(namespace)
            if tag not in self.event['tags']:
                self.event['tags'].append(tag)

            # Prefer the 'name' attribute; fall back to the node text.
            text = node.attrib.get('name', node.text)
            if text is not None:
                if tag in xml_args['metadata_tags']:
                    tag_data = {'tag': tag, 'text': text.strip()}
                    if tag_data not in self.event['tag_data']:
                        self.event['tag_data'].append(tag_data)
                elif tag in xml_args['extract_tags']:
                    extract_file = strelka.File(
                        name=tag,
                        source=self.name,
                    )
                    for c in strelka.chunk_string(text):
                        self.upload_to_coordinator(
                            extract_file.pointer,
                            c,
                            self.expire_at,
                        )
                    self.files.append(extract_file)
                    self.event['total']['extracted'] += 1

        # Fix: was `self._recurse_node(self, child, xml_args)`, which
        # passes `self` twice and raises TypeError. Also iterate the
        # element directly instead of the removed Element.getchildren().
        for child in node:
            self._recurse_node(child, xml_args)
    return
def scan(self, data, file, options, expire_at):
    """Gunzips the payload and extracts the decompressed bytes."""
    with io.BytesIO(data) as gzip_io:
        with gzip.GzipFile(fileobj=gzip_io) as gzip_obj:
            inflated = gzip_obj.read()

    self.event['size'] = len(inflated)

    extract_file = strelka.File(source=self.name, )
    for chunk in strelka.chunk_string(inflated):
        self.upload_to_coordinator(
            extract_file.pointer,
            chunk,
            expire_at,
        )
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Checks an image for data hidden in the least-significant bits.

    Decodes the image, reconstructs a candidate payload from the LSBs,
    and sets event['lsb'] to True when a printable payload is found.
    """
    # Fix: np.fromstring is deprecated/removed for binary input;
    # np.frombuffer is the drop-in replacement.
    image = np.frombuffer(data, np.uint8)
    image = cv2.imdecode(image, cv2.IMREAD_COLOR)

    bits = self._get_bits(image)
    bytes_ = self._get_bytes(bits)
    text = self._convert_bytes_to_text(bytes_)
    # Keep only ASCII — non-text LSB noise encodes to nothing.
    flag = text.encode('ascii', 'ignore')

    # More than one recovered byte suggests an embedded payload.
    # (Also removed a dead `strelka.File` that was created but never
    # uploaded or appended in the negative branch.)
    self.event['lsb'] = len(flag) > 1
def scan(self, data, file, options, expire_at):
    """zlib-inflates the payload and extracts the result."""
    inflated = zlib.decompress(data)
    self.event["size"] = len(inflated)

    extract_file = strelka.File(source=self.name)
    for chunk in strelka.chunk_string(inflated):
        self.upload_to_coordinator(
            extract_file.pointer,
            chunk,
            expire_at,
        )
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Cracks and decrypts a password-protected Word document.

    Recovers the password with the `crack_word` helper (John the
    Ripper), decrypts via msoffcrypto, and extracts the plaintext
    document. Flags the scan when cracking or decryption fails.
    """
    jtr_path = options.get('jtr_path', '/jtr/')
    tmp_directory = options.get('tmp_file_directory', '/tmp/')
    password_file = options.get('password_file', '/etc/strelka/passwords.dat')
    log_extracted_pws = options.get('log_pws', False)
    scanner_timeout = options.get('scanner_timeout', 150)
    brute = options.get('brute_force', False)
    max_length = options.get('max_length', 5)

    with io.BytesIO(data) as doc_io:
        msoff_doc = msoffcrypto.OfficeFile(doc_io)
        output_doc = io.BytesIO()
        if extracted_pw := crack_word(
                self, data, jtr_path, tmp_directory,
                brute=brute,
                scanner_timeout=scanner_timeout,
                max_length=max_length,
                password_file=password_file):
            if log_extracted_pws:
                self.event['cracked_password'] = extracted_pw
            try:
                msoff_doc.load_key(password=extracted_pw.decode('utf-8'))
                msoff_doc.decrypt(output_doc)
                output_doc.seek(0)
                extract_data = output_doc.read()
                output_doc.seek(0)
                extract_file = strelka.File(source=self.name, )
                for c in strelka.chunk_string(extract_data):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        c,
                        expire_at,
                    )
                self.files.append(extract_file)
            # Narrowed from a bare `except:` — any decryption failure
            # is reported via the flag rather than crashing the scan.
            except Exception:
                self.flags.append(
                    'Could not decrypt document with recovered password')
        else:
            # Fix: the original ended with a dangling `else:` with no
            # body (a SyntaxError); report the cracking failure.
            self.flags.append('Could not extract password')
def scan(self, data, file, options, expire_at):
    """Collects docx core properties; optionally extracts paragraph text."""
    extract_text = options.get('extract_text', False)

    with io.BytesIO(data) as docx_io:
        docx_doc = docx.Document(docx_io)
        props = docx_doc.core_properties

        self.event['author'] = props.author
        self.event['category'] = props.category
        self.event['comments'] = props.comments
        self.event['content_status'] = props.content_status
        # Datetime properties are recorded as ISO-8601 when present.
        if props.created is not None:
            self.event['created'] = props.created.isoformat()
        self.event['identifier'] = props.identifier
        self.event['keywords'] = props.keywords
        self.event['language'] = props.language
        self.event['last_modified_by'] = props.last_modified_by
        if props.last_printed is not None:
            self.event['last_printed'] = props.last_printed.isoformat()
        if props.modified is not None:
            self.event['modified'] = props.modified.isoformat()
        self.event['revision'] = props.revision
        self.event['subject'] = props.subject
        self.event['title'] = props.title
        self.event['version'] = props.version

        if extract_text:
            extract_file = strelka.File(
                name='text',
                source=self.name,
            )
            for paragraph in docx_doc.paragraphs:
                self.upload_to_cache(
                    extract_file.pointer,
                    paragraph.text,
                    expire_at,
                )
            self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Parses an email message, deduplicating headers and extracting parts."""
    self.event['total'] = {'parts': 0, 'extracted': 0}

    try:
        message = email.message_from_string(
            data.decode('UTF-8', 'replace'))

        self.event.setdefault('headers', [])
        for key, value in message.items():
            entry = {
                'header': key,
                'value': strelka.normalize_whitespace(value.strip()),
            }
            # Identical header/value pairs are recorded only once.
            if entry not in self.event['headers']:
                self.event['headers'].append(entry)

        self.event.setdefault('parts', [])
        for index, part in enumerate(message.walk()):
            self.event['total']['parts'] += 1
            payload = part.get_payload(decode=True)
            if payload is None:
                continue
            filename = part.get_filename()
            if filename is not None:
                extract_name = f'{filename}'
                self.event['parts'].append(filename)
            else:
                extract_name = f'part_{index}'

            extract_file = strelka.File(
                name=extract_name,
                source=self.name,
            )
            extract_file.add_flavors(
                {'external': [part.get_content_type()]})
            for chunk in strelka.chunk_string(payload):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    chunk,
                    expire_at,
                )
            self.files.append(extract_file)
            self.event['total']['extracted'] += 1
    except AssertionError:
        self.flags.append('assertion_error')
def scan(self, data, file, options, expire_at):
    """Extracts bytes appended past the size declared in the BMP header."""
    # Bytes 2-5 of a BMP header hold the declared file size (little-endian).
    declared_size = int.from_bytes(data[2:6], "little")
    actual_size = len(data)

    if declared_size == actual_size:
        self.flags.append('no_trailer')
        return

    self.event['trailer_index'] = declared_size
    trailer = data[declared_size:]

    extract_file = strelka.File(
        source=self.name,
    )
    for chunk in strelka.chunk_string(trailer):
        self.upload_to_coordinator(
            extract_file.pointer,
            chunk,
            expire_at,
        )
    self.event['BMP_EOF'] = data[declared_size:]
    self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Extracts data appended after the JPEG EOI (FF D9) marker."""
    if data.endswith(b'\xff\xd9'):
        # File ends exactly at the EOI marker; nothing appended.
        return

    eoi_index = data.rfind(b'\xff\xd9')
    if eoi_index == -1:
        self.flags.append('no_trailer')
        return

    trailer = data[eoi_index + 2:]
    if trailer:
        self.event['trailer_index'] = eoi_index
        extract_file = strelka.File(source=self.name, )
        for chunk in strelka.chunk_string(trailer):
            self.upload_to_coordinator(
                extract_file.pointer,
                chunk,
                expire_at,
            )
        self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Detects and extracts data appended after the PNG IEND chunk."""
    # Fix: the original compared single indexed bytes (ints) against
    # bytes literals (e.g. data[n] == b'\x82'), which is always False in
    # Python 3, so the "no trailer" branch was unreachable.
    if data[-3:] == b'\x42\x60\x82':
        # File ends exactly at the IEND chunk tail; nothing appended.
        self.flags.append('no_trailer')
    else:
        trailer_index = data.rfind(b'\x42\x60\x82')
        if trailer_index == -1:
            # Official end-of-file marker not found anywhere.
            self.event['end_index'] = -1
        else:
            trailer_index = trailer_index + 3
            self.event['trailer_index'] = trailer_index
            trailer_data = data[trailer_index:]

            extract_file = strelka.File(source=self.name)
            # Fix: the extracted file was previously appended without
            # any content being uploaded.
            for c in strelka.chunk_string(trailer_data):
                self.upload_to_coordinator(
                    extract_file.pointer,
                    c,
                    expire_at,
                )
            self.event['PNG_EOF'] = trailer_data
            self.files.append(extract_file)
def scan(self, data, file, options, expire_at):
    """Decompresses bzip2 data and extracts the result."""
    with io.BytesIO(data) as bzip2_io:
        with bz2.BZ2File(filename=bzip2_io) as bzip2_obj:
            try:
                inflated = bzip2_obj.read()
                self.event['size'] = len(inflated)

                extract_file = strelka.File(source=self.name, )
                for chunk in strelka.chunk_string(inflated):
                    self.upload_to_coordinator(
                        extract_file.pointer,
                        chunk,
                        expire_at,
                    )
                self.files.append(extract_file)
            except EOFError:
                self.flags.append('eof_error')
            except OSError:
                self.flags.append('os_error')
def scan(self, data, file, options, expire_at):
    """Extracts regular-file members from a tar archive."""
    file_limit = options.get('limit', 1000)
    self.event['total'] = {'files': 0, 'extracted': 0}

    with io.BytesIO(data) as tar_io:
        try:
            with tarfile.open(fileobj=tar_io) as tar_obj:
                tar_members = tar_obj.getmembers()
                self.event['total']['files'] = len(tar_members)
                for tar_member in tar_members:
                    # Fix: TarInfo.isfile is a *method*; the bare
                    # attribute reference was always truthy, so every
                    # member type passed this check.
                    if tar_member.isfile():
                        if self.event['total']['extracted'] >= file_limit:
                            break
                        try:
                            tar_file = tar_obj.extractfile(tar_member)
                            if tar_file is not None:
                                extract_file = strelka.File(
                                    name=tar_member.name,
                                    source=self.name,
                                )
                                for c in strelka.chunk_string(
                                        tar_file.read()):
                                    self.upload_to_cache(
                                        extract_file.pointer,
                                        c,
                                        expire_at,
                                    )
                                self.files.append(extract_file)
                                self.event['total']['extracted'] += 1
                        except KeyError:
                            self.flags.append('key_error')
        except tarfile.ReadError:
            self.flags.append('tarfile_read_error')
def scan(self, data, file, options, expire_at):
    """OCRs an image with tesseract; optionally extracts the raw text."""
    extract_text = options.get('extract_text', False)
    tmp_directory = options.get('tmp_directory', '/tmp/')

    with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_data:
        tmp_data.write(data)
        tmp_data.flush()

        with tempfile.NamedTemporaryFile(dir=tmp_directory) as tmp_tess:
            tess_return = subprocess.call(
                ['tesseract', tmp_data.name, tmp_tess.name],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL
            )

            # tesseract writes its output to '<base>.txt'.
            tess_txt_name = f'{tmp_tess.name}.txt'
            if tess_return == 0:
                with open(tess_txt_name, 'rb') as tess_txt:
                    ocr_file = tess_txt.read()
                    if ocr_file:
                        self.event['text'] = ocr_file.split()
                        if extract_text:
                            extract_file = strelka.File(
                                name='text',
                                source=self.name,
                            )
                            for c in strelka.chunk_string(ocr_file):
                                self.upload_to_coordinator(
                                    extract_file.pointer,
                                    c,
                                    expire_at,
                                )
                            self.files.append(extract_file)
            else:
                # Fix: missing f-prefix meant the literal string
                # 'return_code_{tess_return}' was appended.
                self.flags.append(f'return_code_{tess_return}')

            # Fix: guard the cleanup — on failure tesseract may not have
            # created the .txt, and os.remove would raise.
            if os.path.exists(tess_txt_name):
                os.remove(tess_txt_name)