async def make_snapshot_result() -> SnapshotResult:
    screenshot = Screenshot()
    screenshot.data = ""

    return SnapshotResult(
        snapshot=Snapshot(
            id=uuid.uuid4(),
            url="http://example.com/",
            submitted_url="http://example.com",
            status=200,
            hostname="example.com",
            ip_address="1.1.1.1",
            asn="AS15133 MCI Communications Services, Inc. d/b/a Verizon Business",
            server="ECS (sjc/4E5D)",
            content_type="text/html; charset=UTF-8",
            content_length=1256,
            headers={},
            body="foo bar",
            sha256="fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75",
            screenshot="yoyo",
            whois="foo",
            request={},
            created_at=datetime.datetime.now(),
        ),
        screenshot=screenshot,
        scripts=[],
    )
async def mock_take_snapshot(*args, **kwargs):
    screenshot = Screenshot()
    screenshot.data = ""

    return SnapshotResult(
        snapshot=Snapshot(
            url="https://www.w3.org/",
            submitted_url="https://www.w3.org",
            status=200,
            hostname="example.com",
            ip_address="1.1.1.1",
            asn="AS15133 MCI Communications Services, Inc. d/b/a Verizon Business",
            server="ECS (sjc/4E5D)",
            content_type="text/html; charset=UTF-8",
            content_length=1256,
            headers={},
            body='<html><body><script type="text/javascript" src="/2008/site/js/main"></body></html>',
            sha256="fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75",
            screenshot=Screenshot(data=""),
            whois="foo",
            request={},
        ),
        screenshot=screenshot,
        scripts=[
            Script(
                url="https://www.w3.org/2008/site/js/main",
                content="foo",
                sha256="dummy",
            )
        ],
    )
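# Usage sketch (illustrative, not from the source): wiring mock_take_snapshot into a
# pytest test so the API is exercised without launching a real browser. The patch
# target "app.services.browser.take_snapshot", the async "client" fixture, and the
# "/api/snapshots/" endpoint are assumptions about the surrounding project.
import pytest


@pytest.mark.asyncio
async def test_create_snapshot(client, monkeypatch):
    monkeypatch.setattr("app.services.browser.take_snapshot", mock_take_snapshot)

    response = await client.post("/api/snapshots/", json={"url": "https://www.w3.org"})
    assert response.status_code == 201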
async def snapshots_setup(client):
    for i in range(1, 11):
        snapshot = Snapshot(
            url=f"http://example{i}.com/",
            submitted_url=f"http://example{i}.com",
            status=200,
            hostname="example.com",
            ip_address="1.1.1.1",
            asn="AS15133 MCI Communications Services, Inc. d/b/a Verizon Business",
            server="ECS (sjc/4E5D)",
            content_type="text/html; charset=UTF-8",
            content_length=1256,
            headers={},
            body="foo bar",
            sha256="fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75",
            whois="foo",
            request={},
            created_at=datetime.datetime.now(),
        )
        await snapshot.save()

        screenshot = Screenshot()
        screenshot.data = ""
        screenshot.snapshot_id = snapshot.id
        await screenshot.save()
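# Usage sketch (illustrative, not from the source): exposing snapshots_setup as a
# pytest fixture so each test starts with ten seeded snapshots. The async "client"
# fixture and the "/api/snapshots/search" endpoint are assumptions.
import pytest


@pytest.fixture
async def snapshots_fixture(client):
    await snapshots_setup(client)


@pytest.mark.asyncio
async def test_search_snapshots(client, snapshots_fixture):
    response = await client.get("/api/snapshots/search")
    assert response.status_code == 200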
@classmethod
def import_as_snapshot(cls, uuid: str) -> Snapshot:
    """Import urlscan.io scan as a snapshot

    Arguments:
        uuid {str} -- Scan ID

    Returns:
        Snapshot -- Snapshot ORM instance
    """
    instance = cls(uuid)
    result = instance.result()

    # pick the first successful (HTTP 200) response from the scan's requests
    requests = result.get("data", {}).get("requests", [])
    response = {}
    for request in requests:
        tmp = request.get("response", {}).get("response", {})
        if tmp.get("status") == 200:
            response = tmp
            break

    url = result.get("page", {}).get("url")
    submitted_url = result.get("task", {}).get("url")
    hostname = result.get("page", {}).get("domain")
    ip_address = result.get("page", {}).get("ip")
    asn = result.get("page", {}).get("asn")
    asnname = result.get("page", {}).get("asnname")
    headers = response.get("headers", {})
    server = result.get("page", {}).get("server")
    content_type = headers.get("Content-Type") or headers.get("content-type")
    content_length = headers.get("Content-Length") or headers.get("content-length")

    body = instance.body()
    # assumes the scan result contains at least one body hash
    sha256 = result.get("lists", {}).get("hashes", [])[0]
    screenshot = instance.screenshot()

    time = result.get("task", {}).get("time")
    created_at = datetime.datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%fZ")

    return Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=200,
        hostname=hostname,
        ip_address=ip_address,
        asn=f"{asn} {asnname}",
        server=server,
        content_type=content_type,
        content_length=content_length,
        headers=headers,
        body=body,
        sha256=sha256,
        screenshot=screenshot,
        created_at=created_at,
    )
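# Usage sketch (illustrative, not from the source): importing a urlscan.io scan by its
# UUID and persisting the resulting ORM instance. "UrlScan" as the enclosing class name
# and the async save() method are assumptions about the surrounding project.
async def import_from_urlscan(scan_id: str) -> Snapshot:
    snapshot = UrlScan.import_as_snapshot(scan_id)
    await snapshot.save()
    return snapshot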
async def take_snapshot(
    url: str,
    accept_language: Optional[str] = None,
    host: Optional[str] = None,
    ignore_https_errors: bool = False,
    referer: Optional[str] = None,
    timeout: Optional[int] = None,
    user_agent: Optional[str] = None,
) -> SnapshotResult:
    """Take a snapshot of a website by httpx

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        accept_language {Optional[str]} -- Accept-Language header to use (default: {None})
        host {Optional[str]} -- Host header to use (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
        referer {Optional[str]} -- Referer header to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for in milliseconds (default: {None})
        user_agent {Optional[str]} -- User-Agent header to use (default: {None})

    Returns:
        SnapshotResult
    """
    submitted_url: str = url
    verify = not ignore_https_errors

    try:
        # httpx expects seconds; the incoming timeout is in milliseconds (default: 30 seconds)
        timeout = int(timeout / 1000) if timeout is not None else 30

        headers = {
            "user-agent": user_agent or DEFAULT_UA,
            "accept-language": accept_language or DEFAULT_AL,
            "referer": referer or DEFAULT_REFERER,
        }
        if host is not None:
            headers["host"] = host

        async with httpx.AsyncClient(verify=verify) as client:
            res = await client.get(
                url,
                headers=headers,
                timeout=timeout,
                allow_redirects=True,
            )

        request = {
            "accept_language": accept_language,
            "browser": "httpx",
            "host": host,
            "ignore_https_errors": ignore_https_errors,
            "referer": referer,
            "timeout": timeout,
            "user_agent": user_agent,
        }

        url = str(res.url)
        status = res.status_code
        body = res.text
        sha256 = calculate_sha256(body)
        # reuse the name for the lowercased response headers
        headers = {k.lower(): v for (k, v) in res.headers.items()}
    except httpx.HTTPError as e:
        raise e

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address) or ""
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
    )

    # httpx does not render pages, so the screenshot is left empty
    screenshot = Screenshot()
    screenshot.data = ""

    # extract scripts from the fetched body without persisting them yet
    scripts = cast(List[Script], await ScriptTask.process(snapshot, insert_to_db=False))

    return SnapshotResult(screenshot=screenshot, snapshot=snapshot, scripts=scripts)
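# Usage sketch (illustrative, not from the source): calling the httpx-based
# take_snapshot above from a synchronous entry point. The URL and the 10-second
# timeout (passed in milliseconds) are arbitrary example values.
import asyncio


def fetch_example() -> SnapshotResult:
    return asyncio.run(take_snapshot("https://example.com", timeout=10 * 1000))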
async def take_snapshot(
    url: str,
    accept_language: Optional[str] = None,
    ignore_https_errors: bool = False,
    referer: Optional[str] = None,
    timeout: Optional[int] = None,
    user_agent: Optional[str] = None,
) -> SnapshotResult:
    """Take a snapshot of a website by playwright

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        accept_language {Optional[str]} -- Accept-Language header to use (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
        referer {Optional[str]} -- Referer header to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for in milliseconds (default: {None})
        user_agent {Optional[str]} -- User-Agent header to use (default: {None})

    Returns:
        SnapshotResult
    """
    submitted_url: str = url

    try:
        async with async_playwright() as p:
            browser: playwright.browser.Browser = await launch_browser(p)
            page: Page = await browser.newPage(
                ignoreHTTPSErrors=ignore_https_errors, userAgent=user_agent
            )

            headers = {}
            if accept_language is not None:
                headers["Accept-Language"] = accept_language
            await page.setExtraHTTPHeaders(headers)

            # intercept responses on the page to collect scripts
            scripts: List[Script] = []

            async def handle_response(response: Response) -> None:
                content_type: str = response.headers.get("content-type", "")
                if response.ok and is_js_content_type(content_type):
                    content = await response.text()
                    scripts.append(
                        Script(
                            url=response.url,
                            content=content,
                            sha256=calculate_sha256(content),
                        )
                    )

            page.on(
                "response",
                lambda response: asyncio.create_task(handle_response(response)),
            )

            # default timeout = 30 seconds (the value is in milliseconds)
            timeout = timeout or 30 * 1000
            res: Response = await page.goto(
                url,
                referer=referer,
                timeout=timeout,
                waitUntil=settings.BROWSER_WAIT_UNTIL,
            )

            request = {
                "accept_language": accept_language,
                "browser": browser.version,
                "ignore_https_errors": ignore_https_errors,
                "referer": referer,
                "timeout": timeout,
                "user_agent": await page.evaluate("() => navigator.userAgent"),
            }

            url = page.url
            status = res.status
            screenshot_data = await page.screenshot()
            body = await page.content()
            sha256 = calculate_sha256(body)
            headers = res.headers

            await browser.close()
    except Error as e:
        raise e

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address) or ""
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
    )

    screenshot = Screenshot()
    screenshot.data = base64.b64encode(screenshot_data).decode()

    return SnapshotResult(
        screenshot=screenshot,
        snapshot=snapshot,
        scripts=scripts,
    )
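# Usage sketch (illustrative, not from the source): taking a playwright-based snapshot
# with custom headers and persisting its parts. The async save() methods and the
# snapshot_id linkage are assumptions about the surrounding ORM.
async def snapshot_and_store(url: str) -> Snapshot:
    result = await take_snapshot(
        url,
        accept_language="en-US",
        referer="https://example.com/",
        timeout=15 * 1000,
    )
    snapshot = result.snapshot
    await snapshot.save()

    result.screenshot.snapshot_id = snapshot.id
    await result.screenshot.save()

    for script in result.scripts:
        script.snapshot_id = snapshot.id
        await script.save()

    return snapshot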
async def take_snapshot(
    url: str,
    user_agent: Optional[str] = None,
    timeout: Optional[int] = None,
    ignore_https_errors: bool = False,
) -> Snapshot:
    """Take a snapshot of a website by pyppeteer

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        user_agent {Optional[str]} -- User agent to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for in milliseconds (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})

    Returns:
        Snapshot -- Snapshot ORM instance
    """
    submitted_url: str = url
    browser = None

    try:
        browser = await launch(
            headless=True,
            ignoreHTTPSErrors=ignore_https_errors,
            args=["--no-sandbox"],
        )
        page = await browser.newPage()
        if user_agent is not None:
            await page.setUserAgent(user_agent)

        # default timeout = 30 seconds (the value is in milliseconds)
        timeout = timeout if timeout is not None else 30 * 1000
        res = await page.goto(url, timeout=timeout)

        request = {
            "browser": await browser.version(),
            "ignore_https_errors": ignore_https_errors,
            "timeout": timeout,
            "user_agent": user_agent or await browser.userAgent(),
        }

        url = page.url
        status = res.status
        screenshot = await page.screenshot(encoding="base64")
        body = await page.content()
        sha256 = calculate_sha256(body)
        headers = res.headers
    except PyppeteerError as e:
        raise e
    finally:
        # close the browser exactly once, whether or not an error occurred
        if browser is not None:
            await browser.close()

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address)
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
        screenshot=screenshot,
    )

    return snapshot
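# Usage sketch (illustrative, not from the source): the pyppeteer-based variant returns
# a Snapshot directly (with the base64 screenshot embedded) rather than a SnapshotResult,
# so persisting it is a single save. The spoofed user agent is an arbitrary example value.
async def snapshot_with_custom_ua(url: str) -> Snapshot:
    snapshot = await take_snapshot(
        url,
        user_agent="Mozilla/5.0 (compatible; ExampleBot/1.0)",
        ignore_https_errors=True,
    )
    await snapshot.save()
    return snapshot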