def handle_form(self, form):
    """Submit a <form> found in the page and analyze the response document.

    The normalized action URL is deduplicated through self.forms so the same
    form is never submitted twice.
    """
    from .Window import Window

    log.info(form)

    action = form.get('action', None)
    if action is None:
        return

    _action = log.HTTPSession.normalize_url(self.window, action)
    if _action is None:
        return

    # Already submitted: avoid re-posting the same form
    if _action in self.forms:
        return

    self.forms.append(_action)

    method = form.get('method', 'get')
    payload = None

    # Collect name/value pairs from <input> children as the POST/GET payload
    for child in form.find_all():
        name = getattr(child, 'name', None)

        # BUGFIX: `name` may be None (getattr default) — guard before .lower()
        if name and name.lower() in ('input', ):
            if payload is None:
                payload = dict()

            if all(p in child.attrs for p in ('name', 'value', )):
                payload[child.attrs['name']] = child.attrs['value']

    headers = dict()
    headers['Content-Type'] = 'application/x-www-form-urlencoded'

    try:
        # NOTE: the raw (unnormalized) action is fetched; the normalized one
        # is used as the new window URL below
        response = self.window._navigator.fetch(action,
                                                headers = headers,
                                                method = method.upper(),
                                                body = payload,
                                                redirect_type = "form")
    except Exception:
        return

    if response is None:
        return

    if response.status_code == 404:
        return

    # Hand non-HTML content off to the registered MIME handler, if any
    ctype = response.headers.get('content-type', None)
    if ctype:
        handler = log.MIMEHandler.get_handler(ctype)
        if handler and handler(action, response.content):
            return

    doc = w3c.parseString(response.content)
    window = Window(_action, doc, personality = log.ThugOpts.useragent)
    dft = DFT(window, forms = self.forms)
    dft.run()
def set_href(self, url):
    """Follow a location/href redirection in a brand new Window.

    Data URIs are delegated to the DFT data-URI handler; redirections to
    the current URL are skipped to avoid trivial loops.
    """
    from .Window import Window

    # Inline data: URIs need no navigation at all
    if url.startswith("data:"):
        log.DFT._handle_data_uri(url)
        return

    referer = self._window.url
    if referer == url:
        log.warning("Detected redirection from %s to %s... skipping", referer, url)
        return

    # Find the personality key matching the current navigator user agent.
    # The loop variable `p` is deliberately reused after the loop (see the
    # pylint pragma below).
    for p in log.ThugOpts.Personality:
        if log.ThugOpts.Personality[p]['userAgent'] == self._window._navigator.userAgent:
            break

    url = log.HTTPSession.normalize_url(self._window, url)
    log.ThugLogging.log_href_redirect(referer, url)

    doc = w3c.parseString('')
    window = Window(referer, doc, personality = p) # pylint:disable=undefined-loop-variable
    window = window.open(url)
    if not window:
        return

    # self._window.url = url
    dft = DFT(window)
    dft.run()
def do_handle_form(self, form):
    """Submit a <form> found in the page and analyze the response document.

    Forms with a missing or 'self' action fall back to the last fetched URL
    (or the current window URL).
    """
    from .Window import Window

    log.info(form)

    action = form.get('action', None)
    if action in (None, 'self', ): # pragma: no cover
        last_url = getattr(log, 'last_url', None)
        action = last_url if last_url else self.window.url

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.increase_url_count()

    _action = log.HTTPSession.normalize_url(self.window, action)
    if _action is None: # pragma: no cover
        return

    # Track the normalized action for recursive DFT runs
    if _action not in self.forms:
        self.forms.append(_action)

    method = form.get('method', 'get')
    payload = None

    # Collect name/value pairs from <input> children as the request payload
    for child in form.find_all():
        name = getattr(child, 'name', None)

        # BUGFIX: `name` may be None (getattr default) — guard before .lower()
        if name and name.lower() in ('input', ):
            if payload is None:
                payload = dict()

            if all(p in child.attrs for p in ('name', 'value', )):
                payload[child.attrs['name']] = child.attrs['value']

    headers = dict()
    headers['Content-Type'] = 'application/x-www-form-urlencoded'

    try:
        response = self.window._navigator.fetch(action,
                                                headers = headers,
                                                method = method.upper(),
                                                body = payload,
                                                redirect_type = "form")
    except Exception as e: # pragma: no cover
        log.info("[ERROR][do_handle_form] %s", str(e))
        return

    if response is None or not response.ok:
        return

    # A MIME handler already consumed this response during fetch
    if getattr(response, 'thug_mimehandler_hit', False): # pragma: no cover
        return

    doc = w3c.parseString(response.content)
    window = Window(_action, doc, personality = log.ThugOpts.useragent)
    dft = DFT(window, forms = self.forms)
    dft.run()
def handle_meta_refresh(self, http_equiv, content):
    """Handle a <meta http-equiv="refresh" content="...;url=..."> redirect.

    Each distinct URL is followed at most 3 times (tracked in self.meta)
    to avoid refresh loops. Data URIs are delegated to _handle_data_uri.
    """
    from .Window import Window

    if http_equiv.lower() != 'refresh':
        return

    if 'url' not in content.lower():
        return

    url = None
    data_uri = True if 'data:' in content else False

    # Extract the url= component; for data: URIs keep re-appending the
    # ';'-split fragments since the data payload itself may contain ';'
    for s in content.split(';'):
        if data_uri is True and url is not None:
            url = "{};{}".format(url, s)

        s = s.strip()
        if s.lower().startswith('url='):
            url = s[4:]

    if not url:
        return

    # Strip optional single quotes around the URL
    if url.startswith("'") and url.endswith("'"):
        url = url[1:-1]

    # Refresh-loop protection: at most 3 visits per URL
    if url in self.meta and self.meta[url] >= 3:
        return

    if data_uri:
        self._handle_data_uri(url)
        return

    try:
        response = self.window._navigator.fetch(url, redirect_type="meta")
    except Exception:
        return

    if response is None:
        return

    if response.status_code == 404:
        return

    if url in self.meta:
        self.meta[url] += 1
    else:
        self.meta[url] = 1

    doc = w3c.parseString(response.content)
    window = Window(self.window.url, doc, personality=log.ThugOpts.useragent)
    # window.open(url)
    dft = DFT(window)
    dft.run()
def _handle_data_uri(self, uri):
    """
    Data URI Scheme
    data:[<MIME-type>][;charset=<encoding>][;base64],<data>

    The encoding is indicated by ;base64. If it is present the data is
    encoded as base64. Without it the data (as a sequence of octets) is
    represented using ASCII encoding for octets inside the range of safe
    URL characters and using the standard %xx hex encoding of URLs for
    octets outside that range. If <MIME-type> is omitted, it defaults to
    text/plain;charset=US-ASCII. (As a shorthand, the type can be omitted
    but the charset parameter supplied.)

    Some browsers (Chrome, Opera, Safari, Firefox) accept a non-standard
    ordering if both ;base64 and ;charset are supplied, while Internet
    Explorer requires that the charset's specification must precede the
    base64 token.

    Returns True when the URI was handled (HTML analyzed or MIME handler
    hit), False otherwise.
    """
    if not uri.lower().startswith("data:"):
        return False

    log.URLClassifier.classify(uri)

    # Split header from payload: data:<header>,<payload>
    h = uri.split(",")
    if len(h) < 2:
        return False

    data = h[1]
    opts = h[0][len("data:"):].split(";")

    if 'base64' in opts:
        data = base64.b64decode(h[1])
        opts.remove('base64')

    # RFC 2397 default media type when none is supplied
    if not opts:
        opts = ["text/plain", "charset=US-ASCII"]

    mimetype = opts[0]

    if mimetype in ('text/html', ):
        from .Window import Window

        doc = w3c.parseString(data)
        window = Window(self.window.url, doc, personality = log.ThugOpts.useragent)
        # window.open(uri)
        dft = DFT(window)
        dft.run()
        return True

    # Non-HTML payloads go straight to the registered MIME handler
    handler = log.MIMEHandler.get_handler(mimetype)
    if handler:
        handler(self.window.url, data)
        return True

    return False
def follow_href(self, href):
    """Open `href` in a fresh Window rooted at the current URL and run DFT on it."""
    from .Window import Window

    empty_doc = w3c.parseString('')
    parent = Window(self.window.url, empty_doc, personality = log.ThugOpts.useragent)

    target = parent.open(href)
    if not target:
        return

    DFT(target).run()
def run_local(self, url):
    """Analyze a local file: HTML directly, or JS/JSE wrapped into a document."""
    log.last_url = None
    log.last_url_fetched = None

    log.ThugLogging.set_url(url)
    log.ThugOpts.local = True

    log.HTTPSession = HTTPSession()

    # BUGFIX: close the file handle deterministically instead of leaking it
    with open(url, 'r', encoding="utf-8") as fd:
        content = fd.read()

    extension = os.path.splitext(url)

    if len(extension) > 1 and extension[1].lower() in ('.js', '.jse', ):
        if not content.lstrip().startswith('<script'):
            # Bare script source: wrap it into a minimal HTML document
            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(content))))
        else:
            # Script already wrapped in markup: strip html/head/body wrappers
            # and rebuild a clean document around the script text
            soup = bs4.BeautifulSoup(content, "html.parser")

            try:
                soup.html.unwrap()
            except AttributeError:
                pass

            try:
                soup.head.unwrap()
            except AttributeError:
                pass

            try:
                soup.body.unwrap()
            except AttributeError:
                pass

            code = soup.script.get_text(types=(NavigableString, CData, Script))
            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(code))))
    else:
        html = content

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.add_characters_count(len(html))

        # `html` may be bytes (lxml tostring); the isinstance guard skips ints
        whitespaces_count = len([a for a in html if isinstance(a, six.string_types) and a.isspace()])
        log.ThugLogging.Features.add_whitespaces_count(whitespaces_count)

    doc = w3c.parseString(html)
    window = Window('about:blank', doc, personality=log.ThugOpts.useragent)
    window.open()
    self.__run(window)
def run_local(self, url):
    """Analyze a local file: HTML directly, or JS/JSE wrapped into a document."""
    log.last_url = None
    log.last_url_fetched = None

    log.ThugLogging.set_url(url)
    log.ThugOpts.local = True

    log.HTTPSession = HTTPSession()

    # BUGFIX: close the file handle deterministically instead of leaking it
    with open(url, 'r') as fd:
        content = fd.read()

    extension = os.path.splitext(url)

    # NOTE(review): cchardet.detect expects bytes and content.decode assumes a
    # bytes payload — this path looks Python 2 only; confirm interpreter target
    encoding = cchardet.detect(content)

    if len(extension) > 1 and extension[1].lower() in ('.js', '.jse', ):
        if not content.lstrip().startswith('<script'):
            # Bare script source: wrap it into a minimal HTML document
            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(content.decode(encoding['encoding'])))))
        else:
            # Script already wrapped in markup: strip html/head/body wrappers
            soup = BeautifulSoup(content, "html.parser")

            try:
                soup.html.unwrap()
            except AttributeError:
                pass

            try:
                soup.head.unwrap()
            except AttributeError:
                pass

            try:
                soup.body.unwrap()
            except AttributeError:
                pass

            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(soup.script.get_text()))))
    else:
        html = content

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.add_characters_count(len(html))
        log.ThugLogging.Features.add_whitespaces_count(len([a for a in html if a.isspace()]))

    doc = w3c.parseString(html)
    window = Window('about:blank', doc, personality=log.ThugOpts.useragent)
    window.open()
    self.__run(window)
def handle_meta_refresh(self, http_equiv, content):
    """Handle a <meta http-equiv="refresh" content="...;url=..."> redirect.

    Each distinct URL is followed at most 3 times (tracked globally in
    log.ThugLogging.meta) to avoid refresh loops. Data URIs are delegated
    to _handle_data_uri.
    """
    from .Window import Window

    if http_equiv.lower() not in ('refresh', ) or 'url' not in content.lower():
        return

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.increase_meta_refresh_count()
        log.ThugLogging.Features.increase_url_count()

    url = None
    data_uri = True if 'data:' in content else False

    # Extract the url= component; for data: URIs keep re-appending the
    # ';'-split fragments since the data payload itself may contain ';'
    for s in content.split(';'):
        if data_uri is True and url is not None:
            url = "{};{}".format(url, s)

        s = s.strip()
        if s.lower().startswith('url='):
            url = s[4:]

    if not url: # pragma: no cover
        return

    # Strip optional single quotes around the URL
    if url.startswith("'") and url.endswith("'"):
        url = url[1:-1]

    # Refresh-loop protection: at most 3 visits per URL
    if url in log.ThugLogging.meta and log.ThugLogging.meta[url] >= 3: # pragma: no cover
        return

    if data_uri:
        self._handle_data_uri(url)
        return

    try:
        response = self.window._navigator.fetch(url, redirect_type = "meta")
    except Exception as e:
        log.info("[ERROR][handle_meta_refresh] %s", str(e))
        return

    if response is None or not response.ok:
        return

    if url not in log.ThugLogging.meta:
        log.ThugLogging.meta[url] = 0

    log.ThugLogging.meta[url] += 1

    doc = w3c.parseString(response.content)
    window = Window(self.window.url, doc, personality = log.ThugOpts.useragent)
    dft = DFT(window)
    dft.run()
def loadXML(self, bstrXML):
    """Parse the supplied XML and flag res:// loads (CVE-2017-0022 probing)."""
    self.xml = w3c.parseString(bstrXML)

    if "res://" not in bstrXML:
        return

    # Inspect every double-quoted chunk for res:// resource probes
    for chunk in bstrXML.split('"'):
        if not chunk.startswith("res://"):
            continue

        log.URLClassifier.classify(chunk)
        log.ThugLogging.add_behavior_warn("[Microsoft XMLDOM ActiveX] Attempting to load %s" % (chunk, ))
        log.ThugLogging.log_classifier("exploit", log.ThugLogging.url, "CVE-2017-0022", None)

        # Probing a known security product path: report "found" via error code 0
        if any(probe.lower() in chunk.lower() for probe in security_sys):
            self.parseError._errorCode = 0
def frames(self):
    """an array of all the frames (including iframes) in the current window"""
    from thug.DOM.W3C.HTML.HTMLCollection import HTMLCollection

    for frame in self._findAll(['frame', 'iframe']):
        # NOTE(review): `unicode` is Python 2 only — NameError under Python 3;
        # confirm the interpreter target or switch to str()
        code = unicode(frame)

        # Deduplicate frames by their serialized markup
        if code in self._inner_frames:
            continue

        self._inner_frames.add(code)
        self._frames.add(Window(self.url, w3c.parseString(code), personality = log.ThugOpts.useragent))

    return HTMLCollection(self.doc, list(self._frames))
def run_remote(self, url):
    """Analyze a remote URL, defaulting the scheme to http:// when missing."""
    scheme = urlparse.urlparse(url).scheme

    if not scheme or not scheme.startswith('http'):
        url = 'http://%s' % (url, )

    log.ThugLogging.set_url(url)
    log.HTTPSession = HTTPSession()

    empty_doc = w3c.parseString('')
    root = Window(log.ThugOpts.referer, empty_doc, personality = log.ThugOpts.useragent)

    opened = root.open(url)
    if opened:
        self.__run(opened)
def loadXML(self, bstrXML):
    """Parse the supplied XML and flag res:// loads (CVE-2017-0022 probing)."""
    self.xml = w3c.parseString(bstrXML)

    if "res://" not in bstrXML:
        return

    # Inspect every double-quoted chunk for res:// resource probes
    for p in bstrXML.split('"'):
        if p.startswith("res://"):
            log.URLClassifier.classify(p)
            log.ThugLogging.add_behavior_warn(
                "[Microsoft XMLDOM ActiveX] Attempting to load %s" % (p, ))
            log.ThugLogging.log_classifier("exploit", log.ThugLogging.url, "CVE-2017-0022", None)

            # Probing a known security product path: report "found" via error code 0
            if any(sys.lower() in p.lower() for sys in security_sys):
                self.parseError._errorCode = 0
def run_local(self, url):
    """Analyze a local file: HTML directly, or JS/JSE wrapped into a document."""
    log.last_url = None
    log.last_url_fetched = None

    log.ThugLogging.set_url(url)
    log.ThugOpts.local = True

    log.HTTPSession = HTTPSession()

    # BUGFIX: close the file handle deterministically instead of leaking it
    with open(url, 'r') as fd:
        content = fd.read()

    extension = os.path.splitext(url)

    # NOTE(review): cchardet.detect expects bytes and content.decode assumes a
    # bytes payload — this path looks Python 2 only; confirm interpreter target
    encoding = cchardet.detect(content)

    if len(extension) > 1 and extension[1].lower() in ('.js', '.jse', ):
        if not content.lstrip().startswith('<script'):
            # Bare script source: wrap it into a minimal HTML document
            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(content.decode(encoding['encoding'])))))
        else:
            # Script already wrapped in markup: strip html/head/body wrappers
            soup = BeautifulSoup(content, "html.parser")

            try:
                soup.html.unwrap()
            except AttributeError:
                pass

            try:
                soup.head.unwrap()
            except AttributeError:
                pass

            try:
                soup.body.unwrap()
            except AttributeError:
                pass

            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(soup.script.get_text()))))
    else:
        html = content

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.add_characters_count(len(html))
        log.ThugLogging.Features.add_whitespaces_count(len([a for a in html if a.isspace()]))

    doc = w3c.parseString(html)
    window = Window('about:blank', doc, personality = log.ThugOpts.useragent)
    window.open()
    self.__run(window)
def run_local(self, url):
    """Analyze a local file: HTML directly, or JS/JSE wrapped into a document."""
    log.ThugLogging.set_url(url)
    log.ThugOpts.local = True

    log.HTTPSession = HTTPSession()

    # BUGFIX: close the file handle deterministically instead of leaking it
    with open(url, 'r') as fd:
        content = fd.read()

    extension = os.path.splitext(url)

    if len(extension) > 1 and extension[1].lower() in ('.js', '.jse', ):
        # Wrap bare script source into a minimal HTML document
        html = tostring(E.HTML(E.BODY(E.SCRIPT(content))))
    else:
        html = content

    doc = w3c.parseString(html)
    window = Window('about:blank', doc, personality = log.ThugOpts.useragent)
    window.open()
    self.__run(window)
def loadXML(self, bstrXML):
    """Parse the supplied XML and flag res:// loads probing local resources.

    Both double- and single-quoted chunks are scanned; probing a known
    security product path is signalled by setting parseError._errorCode to 0
    (i.e. "resource found").
    """
    self.xml = w3c.parseString(bstrXML)
    # self.attributes = NamedNodeMap(self.xml._node)

    if "res://" not in bstrXML:
        return

    # Deduplicated: the original repeated this loop verbatim for each quote
    # character. Order is preserved: all '"' chunks first, then all "'" chunks.
    for quote in ('"', "'"):
        for p in bstrXML.split(quote):
            if not p.startswith("res://"):
                continue

            log.ThugLogging.add_behavior_warn("[Microsoft XMLDOM ActiveX] Attempting to load %s" % (p, ))

            # Renamed from `sys` to avoid shadowing the stdlib module name
            if any(probe.lower() in p.lower() for probe in security_sys):
                self.parseError._errorCode = 0
def run_local(self, url):
    """Analyze a local file: HTML directly, or JS wrapped into a document."""
    log.ThugLogging.set_url(url)
    log.ThugOpts.local = True

    log.HTTPSession = HTTPSession()

    # BUGFIX: close the file handle deterministically instead of leaking it
    with open(url, 'r') as fd:
        content = fd.read()

    extension = os.path.splitext(url)

    # BUGFIX: `in ('.js')` was a substring test against the string '.js'
    # (parentheses without a comma are not a tuple); use real tuple membership
    if len(extension) > 1 and extension[1].lower() in ('.js', ):
        # Wrap bare script source into a minimal HTML document
        html = tostring(E.HTML(E.BODY(E.SCRIPT(content))))
    else:
        html = content

    doc = w3c.parseString(html)
    window = Window('about:blank', doc, personality=log.ThugOpts.useragent)
    window.open()
    self.run(window)
def handle_frame(self, frame, redirect_type='frame'):
    """Fetch a <frame>/<iframe> src and analyze the returned document.

    Non-HTML responses are dispatched to the registered MIME handler; the
    resulting Window is indexed by the frame id (when present) so scripts
    can reference it later.
    """
    from .Window import Window

    log.warning(frame)

    src = frame.get('src', None)
    if not src:
        return

    # Inline data: URIs are fully handled by _handle_data_uri
    if self._handle_data_uri(src):
        return

    try:
        response = self.window._navigator.fetch(
            src, redirect_type=redirect_type)
    except Exception:
        return

    if response is None:
        return

    if response.status_code == 404:
        return

    # Hand non-HTML content off to the registered MIME handler, if any
    ctype = response.headers.get('content-type', None)
    if ctype:
        handler = log.MIMEHandler.get_handler(ctype)
        if handler and handler(src, response.content):
            return

    _src = log.HTTPSession.normalize_url(self.window, src)
    if _src:
        src = _src

    doc = w3c.parseString(response.content)
    window = Window(response.url, doc, personality=log.ThugOpts.useragent)
    # window.open(src)

    # Register the window under the frame id so it can be looked up later
    frame_id = frame.get('id', None)
    if frame_id:
        log.ThugLogging.windows[frame_id] = window

    dft = DFT(window)
    dft.run()
def handle_frame(self, frame, redirect_type = 'frame'):
    """Fetch a <frame>/<iframe> src and analyze the returned document.

    Each response URL is visited at most 3 times (tracked globally in
    log.ThugLogging.frames) to avoid frame loops; responses already consumed
    by a MIME handler during fetch are skipped.
    """
    from .Window import Window

    log.warning(frame)

    src = frame.get('src', None)
    if not src:
        return

    # Inline data: URIs are fully handled by _handle_data_uri
    if self._handle_data_uri(src):
        return

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.increase_url_count()

    try:
        response = self.window._navigator.fetch(src, redirect_type = redirect_type)
    except Exception as e:
        log.info("[ERROR][handle_frame] %s", str(e))
        return

    if response is None or not response.ok: # pragma: no cover
        return # pragma: no cover

    # Frame-loop protection: at most 3 visits per response URL
    if response.url in log.ThugLogging.frames and log.ThugLogging.frames[response.url] >= 3:
        return # pragma: no cover

    if response.url not in log.ThugLogging.frames:
        log.ThugLogging.frames[response.url] = 0

    log.ThugLogging.frames[response.url] += 1

    # A MIME handler already consumed this response during fetch
    if getattr(response, 'thug_mimehandler_hit', False):
        return # pragma: no cover

    doc = w3c.parseString(response.content)
    window = Window(response.url, doc, personality = log.ThugOpts.useragent)

    # Register the window under the frame id so it can be looked up later
    frame_id = frame.get('id', None)
    if frame_id:
        log.ThugLogging.windows[frame_id] = window

    dft = DFT(window)
    dft.run()
def run_local(self, url):
    """Analyze a local file: HTML directly, or JS/JSE wrapped into a document."""
    log.ThugLogging.set_url(url)
    log.ThugOpts.local = True

    log.HTTPSession = HTTPSession()

    # BUGFIX: close the file handle deterministically instead of leaking it
    with open(url, 'r') as fd:
        content = fd.read()

    extension = os.path.splitext(url)

    if len(extension) > 1 and extension[1].lower() in ('.js', '.jse', ):
        if not content.lstrip().startswith('<script'):
            # Bare script source: wrap it into a minimal HTML document
            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(content))))
        else:
            # Script already wrapped in markup: strip html/head/body wrappers
            soup = BeautifulSoup(content, "html.parser")

            try:
                soup.html.unwrap()
            except AttributeError:
                pass

            try:
                soup.head.unwrap()
            except AttributeError:
                pass

            try:
                soup.body.unwrap()
            except AttributeError:
                pass

            html = tostring(E.HTML(E.HEAD(), E.BODY(E.SCRIPT(soup.script.get_text()))))
    else:
        html = content

    doc = w3c.parseString(html)
    window = Window('about:blank', doc, personality=log.ThugOpts.useragent)
    window.open()
    self.__run(window)
def loadXML(self, bstrXML):
    """Parse the supplied XML and flag res:// loads probing local resources.

    Both double- and single-quoted chunks are scanned and classified; probing
    a known security product path is signalled by setting
    parseError._errorCode to 0 (i.e. "resource found").
    """
    self.xml = w3c.parseString(bstrXML)
    # self.attributes = NamedNodeMap(self.xml._node)

    if "res://" not in bstrXML:
        return

    # Deduplicated: the original repeated this loop verbatim for each quote
    # character. Order is preserved: all '"' chunks first, then all "'" chunks.
    for quote in ('"', "'"):
        for p in bstrXML.split(quote):
            if not p.startswith("res://"):
                continue

            log.URLClassifier.classify(p)
            log.ThugLogging.add_behavior_warn("[Microsoft XMLDOM ActiveX] Attempting to load %s" % (p, ))

            # Renamed from `sys` to avoid shadowing the stdlib module name
            if any(probe.lower() in p.lower() for probe in security_sys):
                self.parseError._errorCode = 0
def run_remote(self, url):
    """Analyze a remote URL, defaulting the scheme to http:// when missing."""
    log.last_url = None

    try:
        scheme = urlparse.urlparse(url).scheme
    except ValueError as e:
        # BUGFIX: exceptions have no `.message` attribute on Python 3; str(e)
        # yields the same text and works on both interpreter lines
        log.warning("[WARNING] Analysis not performed (%s)", str(e))
        return

    if not scheme or not scheme.startswith('http'):
        url = 'http://%s' % (url, )

    log.ThugLogging.set_url(url)

    log.HTTPSession = HTTPSession()

    doc = w3c.parseString('')
    window = Window(log.ThugOpts.referer, doc, personality = log.ThugOpts.useragent)
    window = window.open(url)
    if window:
        self.__run(window)
def search_url(self, sc):
    """Extract an http(s) URL embedded in shellcode and analyze it."""
    from thug.DOM.W3C import w3c
    from thug.DOM.Window import Window
    from thug.DOM.DFT import DFT

    # NOTE(review): `offset > 0` skips a URL sitting at index 0 of the
    # shellcode — confirm whether that is intentional
    offset = sc.find('http')
    if offset > 0:
        url = sc[offset:].split()[0]

        # Strip a trailing quote left over from the enclosing string literal
        if url.endswith("'") or url.endswith('"'):
            url = url[:-1]

        # Skip URLs already seen or already retrieved
        if url in log.ThugLogging.shellcode_urls:
            return

        if url in log.ThugLogging.retrieved_urls:
            return

        log.info('[Shellcode Analysis] URL Detected: %s', url)

        try:
            response = self.window._navigator.fetch(
                url, redirect_type="URL found")
            log.ThugLogging.shellcode_urls.add(url)
        except Exception:
            return

        if response is None:
            return

        if not response.ok:
            return

        doc = w3c.parseString(response.content)
        window = Window(url, doc, personality=log.ThugOpts.useragent)
        dft = DFT(window)
        dft.run()
def run_local(self, url):
    """Analyze a local file: HTML directly, or JS/JSE wrapped into a document."""
    log.ThugLogging.set_url(url)
    log.ThugOpts.local = True

    log.HTTPSession = HTTPSession()

    # BUGFIX: close the file handle deterministically instead of leaking it
    with open(url, 'r') as fd:
        content = fd.read()

    extension = os.path.splitext(url)

    if len(extension) > 1 and extension[1].lower() in ('.js', '.jse', ):
        if not content.lstrip().startswith('<script'):
            # Bare script source: wrap it into a minimal HTML document
            html = tostring(E.HTML(E.BODY(E.SCRIPT(content))))
        else:
            # Script already wrapped in markup: strip html/head/body wrappers
            soup = BeautifulSoup(content, "html.parser")

            try:
                soup.html.unwrap()
            except AttributeError:
                pass

            try:
                soup.head.unwrap()
            except AttributeError:
                pass

            try:
                soup.body.unwrap()
            except AttributeError:
                pass

            html = tostring(E.HTML(E.BODY(E.SCRIPT(soup.script.get_text()))))
    else:
        html = content

    doc = w3c.parseString(html)
    window = Window('about:blank', doc, personality = log.ThugOpts.useragent)
    window.open()
    self.__run(window)
def search_url(self, sc):
    """Extract an http(s) URL embedded in shellcode and analyze it."""
    from thug.DOM.W3C import w3c
    from thug.DOM.Window import Window
    from thug.DOM.DFT import DFT

    position = sc.find('http')
    if position <= 0:
        return

    candidate = sc[position:].split()[0]

    # Drop a trailing quote left over from the enclosing string literal
    if candidate.endswith(("'", '"')):
        candidate = candidate[:-1]

    # Skip URLs already seen or already retrieved
    if candidate in log.ThugLogging.shellcode_urls:
        return

    if candidate in log.ThugLogging.retrieved_urls:
        return

    log.info('[Shellcode Analysis] URL Detected: %s', candidate)

    try:
        response = self.window._navigator.fetch(candidate, redirect_type = "URL found")
        log.ThugLogging.shellcode_urls.add(candidate)
    except Exception:
        return

    if response is None or not response.ok:
        return

    document = w3c.parseString(response.content)
    target = Window(candidate, document, personality = log.ThugOpts.useragent)
    DFT(target).run()
def handle_a(self, anchor):
    """Record an <a> element and, in extensive mode, fetch and analyze its href."""
    log.info(anchor)

    self.anchors.append(anchor)

    # Anchor hrefs are only followed when extensive-mode crawling is enabled
    if not log.ThugOpts.extensive:
        return

    href = anchor.get('href', None)
    if not href: # pragma: no cover
        return

    # Inline data: URIs are fully handled by _handle_data_uri
    if self._handle_data_uri(href):
        return

    try:
        response = self.window._navigator.fetch(href, redirect_type = "anchor")
    except Exception as e: # pragma: no cover
        log.info("[ERROR][handle_a] %s", str(e))
        return

    if response is None or not response.ok: # pragma: no cover
        return

    content_type = response.headers.get('content-type', None)
    if not content_type: # pragma: no cover
        return

    # Only HTML responses are recursed into
    if content_type.startswith(('text/html', )):
        from .Window import Window

        doc = w3c.parseString(response.content)
        window = Window(self.window.url, doc, personality = log.ThugOpts.useragent)
        dft = DFT(window)
        dft.run()
def run_remote(self, url):
    """Analyze a remote URL, defaulting the scheme to http:// when missing."""
    log.last_url = None
    log.last_url_fetched = None

    log.ThugOpts.local = False

    try:
        scheme = urlparse.urlparse(url).scheme
    except ValueError as e:
        # BUGFIX: exceptions have no `.message` attribute on Python 3; str(e)
        # yields the same text and works on both interpreter lines
        log.warning("[WARNING] Analysis not performed (%s)", str(e))
        return

    if not scheme or not scheme.startswith('http'):
        url = 'http://%s' % (url, )

    log.ThugLogging.set_url(url)

    log.HTTPSession = HTTPSession()

    doc = w3c.parseString('')
    window = Window(log.ThugOpts.referer, doc, personality = log.ThugOpts.useragent)
    window = window.open(url)
    if window:
        self.__run(window)
def setAttribute(self, name, value):
    """Set an attribute on the wrapped tag, emulating browser quirks.

    Firefox personalities translate hyphenated style values to camelCase and
    gate type/style values on the browser major version. src/archive values
    are additionally fetched and analyzed.
    """
    from thug.DOM.W3C import w3c
    from thug.DOM.Window import Window
    from thug.DOM.DFT import DFT

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.increase_setattribute_count()

    if not isinstance(name, six.string_types):
        name = str(name)

    if log.ThugOpts.Personality.isFirefox():
        if name in ('style', ):
            # Convert hyphenated CSS ('foo-bar-baz') to camelCase ('fooBarBaz')
            svalue = value.split('-')

            _value = svalue[0]
            if len(svalue) > 1:
                _value = '{}{}'.format(_value, ''.join([s.capitalize() for s in svalue[1:]]))

            # Only apply styles supported from the given Firefox major version
            for css in [p for p in FF_STYLES if log.ThugOpts.Personality.browserMajorVersion >= p[0]]:
                if css[1] in value:
                    self.tag.attrs[name] = _value

            return

        if name in ('type', ):
            # Only apply input types supported after the given Firefox version
            for _input in [p for p in FF_INPUTS if log.ThugOpts.Personality.browserMajorVersion > p[0]]:
                if _input[1] in value:
                    self.tag.attrs[name] = value

            return

    self.tag.attrs[name] = value

    # src/archive attributes point at external resources: fetch and analyze
    if name.lower() in ('src', 'archive'):
        s = urlparse.urlsplit(value)

        # Scheme-specific handlers (e.g. javascript:, data:) short-circuit here
        handler = getattr(log.SchemeHandler, 'handle_%s' % (s.scheme, ), None)
        if handler:
            handler(self.doc.window, value)
            return

        try:
            response = self.doc.window._navigator.fetch(value, redirect_type = "element workaround")
        except Exception:
            return

        if response is None or not response.ok:
            return

        ctype = response.headers.get('content-type', None)
        if ctype is None:
            return

        handler = log.MIMEHandler.get_handler(ctype)
        if handler:
            handler(self.doc.window.url, response.content)
            return

        if ctype.startswith(('text/html', )):
            doc = w3c.parseString(response.content)
            window = Window(response.url, doc, personality = log.ThugOpts.useragent)
            dft = DFT(window)
            dft.run()
def setAttribute(self, name, value):
    """Set an attribute on the wrapped tag, emulating browser quirks.

    Firefox personalities translate hyphenated style values to camelCase and
    gate type/style values on the browser major version. src/archive values
    are additionally fetched and analyzed.
    """
    from thug.DOM.W3C import w3c
    from thug.DOM.Window import Window
    from thug.DOM.DFT import DFT

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.increase_setattribute_count()

    if not isinstance(name, six.string_types): # pragma: no cover
        name = str(name)

    if log.ThugOpts.Personality.isFirefox():
        if name in ('style', ):
            # Convert hyphenated CSS ('foo-bar-baz') to camelCase ('fooBarBaz')
            svalue = value.split('-')

            _value = svalue[0]
            if len(svalue) > 1:
                _value = '{}{}'.format(
                    _value, ''.join([s.capitalize() for s in svalue[1:]]))

            # Only apply styles supported from the given Firefox major version
            for css in [
                    p for p in FF_STYLES
                    if log.ThugOpts.Personality.browserMajorVersion >= p[0]
            ]:
                if css[1] in value:
                    self.tag.attrs[name] = _value

            return

        if name in ('type', ):
            # Only apply input types supported after the given Firefox version
            for _input in [
                    p for p in FF_INPUTS
                    if log.ThugOpts.Personality.browserMajorVersion > p[0]
            ]:
                if _input[1] in value:
                    self.tag.attrs[name] = value

            return

    self.tag.attrs[name] = value

    # src/archive attributes point at external resources: fetch and analyze
    if name.lower() in ('src', 'archive'):
        s = urlparse.urlsplit(value)

        # Scheme-specific handlers (e.g. javascript:, data:) short-circuit here
        handler = getattr(log.SchemeHandler, 'handle_%s' % (s.scheme, ), None)
        if handler:
            handler(self.doc.window, value)
            return

        try:
            response = self.doc.window._navigator.fetch(
                value, redirect_type="element workaround")
        except Exception:
            return

        if response is None or not response.ok:
            return

        ctype = response.headers.get('content-type', None)
        if ctype is None: # pragma: no cover
            return

        handler = log.MIMEHandler.get_handler(ctype)
        if handler:
            handler(self.doc.window.url, response.content)
            return

        if ctype.startswith(('text/html', )):
            doc = w3c.parseString(response.content)
            window = Window(response.url, doc, personality=log.ThugOpts.useragent)
            dft = DFT(window)
            dft.run()
def _handle_data_uri(self, uri):
    """
    Data URI Scheme
    data:[<MIME-type>][;charset=<encoding>][;base64],<data>

    The encoding is indicated by ;base64. If it is present the data is
    encoded as base64. Without it the data (as a sequence of octets) is
    represented using ASCII encoding for octets inside the range of safe
    URL characters and using the standard %xx hex encoding of URLs for
    octets outside that range. If <MIME-type> is omitted, it defaults to
    text/plain;charset=US-ASCII. (As a shorthand, the type can be omitted
    but the charset parameter supplied.)

    Some browsers (Chrome, Opera, Safari, Firefox) accept a non-standard
    ordering if both ;base64 and ;charset are supplied, while Internet
    Explorer requires that the charset's specification must precede the
    base64 token.

    Returns the decoded payload on success, None when the URI is not a
    (usable) data URI or was consumed by a MIME handler.
    """
    uri = uri if isinstance(uri, six.string_types) else str(uri)

    if not uri.lower().startswith("data:"):
        return None

    log.URLClassifier.classify(uri)

    if log.ThugOpts.features_logging:
        log.ThugLogging.Features.increase_data_uri_count()

    # Split header from payload: data:<header>,<payload>
    h = uri.split(",")
    if len(h) < 2 or not h[1]: # pragma: no cover
        return None

    data = h[1]
    opts = h[0][len("data:"):].split(";")

    if 'base64' in opts:
        try:
            data = base64.b64decode(h[1])
        except Exception: # pragma: no cover
            # Retry after percent-decoding in case the payload was URL-encoded
            try:
                data = base64.b64decode(urlparse.unquote(h[1]))
            except Exception:
                log.warning("[WARNING] Error while handling data URI: %s", data)
                return None

        opts.remove('base64')

    # RFC 2397 default media type when none is supplied
    if not opts or not opts[0]:
        opts = ["text/plain", "charset=US-ASCII"]

    mimetype = opts[0]

    # Non-HTML payloads go straight to the registered MIME handler
    handler = log.MIMEHandler.get_handler(mimetype)
    if handler:
        handler(self.window.url, data)
        return None

    if mimetype.startswith(('text/html', )):
        from .Window import Window

        doc = w3c.parseString(data)
        window = Window(self.window.url, doc, personality = log.ThugOpts.useragent)
        dft = DFT(window)
        dft.run()

    return data