async def mock_take_snapshot(*args, **kwargs):
    screenshot = Screenshot()
    screenshot.data = ""
    return SnapshotResult(
        snapshot=Snapshot(
            url="https://www.w3.org/",
            submitted_url="https://www.w3.org",
            status=200,
            hostname="example.com",
            ip_address="1.1.1.1",
            asn="AS15133 MCI Communications Services, Inc. d/b/a Verizon Business",
            server="ECS (sjc/4E5D)",
            content_type="text/html; charset=UTF-8",
            content_length=1256,
            headers={},
            body='<html><body><script type="text/javascript" src="/2008/site/js/main"></body></html>',
            sha256="fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75",
            screenshot=Screenshot(data=""),
            whois="foo",
            request={},
        ),
        screenshot=screenshot,
        scripts=[
            Script(
                url="https://www.w3.org/2008/site/js/main",
                content="foo",
                sha256="dummy",
            )
        ],
    )
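# Usage sketch (illustrative, not part of the original tests): mock_take_snapshot()
# is shaped like a drop-in replacement for a real take_snapshot coroutine, so a
# test could patch it in with pytest's monkeypatch. The dotted target path below
# is an assumption; adjust it to wherever the real take_snapshot lives.
#
# def test_snapshot_endpoint(monkeypatch, client):
#     monkeypatch.setattr(
#         "app.services.browser.take_snapshot",  # hypothetical module path
#         mock_take_snapshot,
#     )
#     ...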
async def snapshots_setup(client):
    for i in range(1, 11):
        snapshot = Snapshot(
            url=f"http://example{i}.com/",
            submitted_url=f"http://example{i}.com",
            status=200,
            hostname="example.com",
            ip_address="1.1.1.1",
            asn="AS15133 MCI Communications Services, Inc. d/b/a Verizon Business",
            server="ECS (sjc/4E5D)",
            content_type="text/html; charset=UTF-8",
            content_length=1256,
            headers={},
            body="foo bar",
            sha256="fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75",
            whois="foo",
            request={},
            created_at=datetime.datetime.now(),
        )
        await snapshot.save()

        screenshot = Screenshot()
        screenshot.data = ""
        screenshot.snapshot_id = snapshot.id
        await screenshot.save()
async def make_snapshot_result() -> SnapshotResult:
    screenshot = Screenshot()
    screenshot.data = ""
    return SnapshotResult(
        snapshot=Snapshot(
            id=uuid.uuid4(),
            url="http://example.com/",
            submitted_url="http://example.com",
            status=200,
            hostname="example.com",
            ip_address="1.1.1.1",
            asn="AS15133 MCI Communications Services, Inc. d/b/a Verizon Business",
            server="ECS (sjc/4E5D)",
            content_type="text/html; charset=UTF-8",
            content_length=1256,
            headers={},
            body="foo bar",
            sha256="fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75",
            screenshot="yoyo",
            whois="foo",
            request={},
            created_at=datetime.datetime.now(),
        ),
        screenshot=screenshot,
        scripts=[],
    )
async def preview(hostname: str) -> Screenshot:
    async def _preview(hostname: str, protocol="http") -> Screenshot:
        try:
            async with async_playwright() as p:
                browser = await launch_browser(p)
                page = await browser.newPage()
                await page.goto(
                    f"{protocol}://{hostname}",
                    waitUntil=settings.BROWSER_WAIT_UNTIL,
                )
                screenshot_data = await page.screenshot()
                await browser.close()

                screenshot = Screenshot()
                screenshot.data = base64.b64encode(screenshot_data).decode()
                return screenshot
        except Error as e:
            raise e

    # try with http first
    try:
        return await _preview(hostname, "http")
    except Error:
        pass

    # fall back to https, returning an empty screenshot if that also fails
    try:
        return await _preview(hostname, "https")
    except Error:
        screenshot = Screenshot()
        screenshot.data = ""
        return screenshot
async def preview(hostname: str) -> Screenshot:
    async def _preview(hostname: str, protocol="http") -> Screenshot:
        try:
            browser = await launch_browser()
            page = await browser.newPage()
            await page.goto(
                f"{protocol}://{hostname}",
                waitUntil=settings.BROWSER_WAIT_UNTIL,
            )
            screenshot_data = await page.screenshot(encoding="base64")
            await browser.close()

            screenshot = Screenshot()
            screenshot.data = cast(str, screenshot_data)
            return screenshot
        except PyppeteerError as e:
            raise e

    # try with http first
    try:
        return await _preview(hostname, "http")
    except PyppeteerError:
        pass

    # fall back to https, returning an empty screenshot if that also fails
    try:
        return await _preview(hostname, "https")
    except PyppeteerError:
        screenshot = Screenshot()
        screenshot.data = ""
        return screenshot
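# Usage sketch (illustrative, not part of the original module): either preview()
# variant above is a plain coroutine, so it can be driven from an asyncio entry
# point. The hostname is an arbitrary example value.
if __name__ == "__main__":
    import asyncio

    async def _demo_preview() -> None:
        screenshot = await preview("example.com")
        # screenshot.data is a base64-encoded image, or "" if both http and
        # https attempts failed
        print(len(screenshot.data))

    asyncio.run(_demo_preview())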
async def take_snapshot(
    url: str,
    accept_language: Optional[str] = None,
    host: Optional[str] = None,
    ignore_https_errors: bool = False,
    referer: Optional[str] = None,
    timeout: Optional[int] = None,
    user_agent: Optional[str] = None,
) -> SnapshotResult:
    """Take a snapshot of a website by httpx

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        accept_language {Optional[str]} -- Accept-Language header to use (default: {None})
        host {Optional[str]} -- Host header to use (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
        referer {Optional[str]} -- Referer header to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for, in milliseconds (default: {None})
        user_agent {Optional[str]} -- User-Agent header to use (default: {None})

    Returns:
        SnapshotResult
    """
    submitted_url: str = url
    verify = not ignore_https_errors

    try:
        # default timeout = 30 seconds (the argument is given in milliseconds)
        timeout = int(timeout / 1000) if timeout is not None else 30

        headers = {
            "user-agent": user_agent or DEFAULT_UA,
            "accept-language": accept_language or DEFAULT_AL,
            "referer": referer or DEFAULT_REFERER,
        }
        if host is not None:
            headers["host"] = host

        client = httpx.AsyncClient(verify=verify)
        res = await client.get(
            url,
            headers=headers,
            timeout=timeout,
            allow_redirects=True,
        )

        request = {
            "accept_language": accept_language,
            "browser": "httpx",
            "host": host,
            "ignore_https_errors": ignore_https_errors,
            "referer": referer,
            "timeout": timeout,
            "user_agent": user_agent,
        }

        url = str(res.url)
        status = res.status_code
        body = res.text
        sha256 = calculate_sha256(body)
        headers = {k.lower(): v for (k, v) in res.headers.items()}
    except httpx.HTTPError as e:
        raise e

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address) or ""
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
    )

    # httpx does not render pages, so there is no screenshot to attach
    screenshot = Screenshot()
    screenshot.data = ""

    # get scripts
    scripts = cast(List[Script], await ScriptTask.process(snapshot, insert_to_db=False))

    return SnapshotResult(screenshot=screenshot, snapshot=snapshot, scripts=scripts)
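# Usage sketch (illustrative, not part of the original module): the httpx-based
# take_snapshot() above is a plain coroutine, so it can be driven from asyncio
# directly. The URL is arbitrary, and the timeout is in milliseconds, matching
# the division by 1000 inside the function.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        result = await take_snapshot("https://example.com", timeout=10_000)
        print(result.snapshot.status, result.snapshot.sha256)
        print(len(result.scripts), "script(s) captured")

    asyncio.run(_demo())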
async def take_snapshot(
    url: str,
    accept_language: Optional[str] = None,
    ignore_https_errors: bool = False,
    referer: Optional[str] = None,
    timeout: Optional[int] = None,
    user_agent: Optional[str] = None,
) -> SnapshotResult:
    """Take a snapshot of a website by playwright

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        accept_language {Optional[str]} -- Accept-Language header to use (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
        referer {Optional[str]} -- Referer header to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for, in milliseconds (default: {None})
        user_agent {Optional[str]} -- User-Agent header to use (default: {None})

    Returns:
        SnapshotResult
    """
    submitted_url: str = url

    try:
        async with async_playwright() as p:
            browser: playwright.browser.Browser = await launch_browser(p)
            page: Page = await browser.newPage(
                ignoreHTTPSErrors=ignore_https_errors, userAgent=user_agent
            )

            headers = {}
            if accept_language is not None:
                headers["Accept-Language"] = accept_language
            await page.setExtraHTTPHeaders(headers)

            # intercept responses on page to get scripts
            scripts: List[Script] = []

            async def handle_response(response: Response) -> None:
                content_type: str = response.headers.get("content-type", "")
                if response.ok and is_js_content_type(content_type):
                    content = await response.text()
                    scripts.append(
                        Script(
                            url=response.url,
                            content=content,
                            sha256=calculate_sha256(content),
                        )
                    )

            page.on(
                "response",
                lambda response: asyncio.create_task(handle_response(response)),
            )

            # default timeout = 30 seconds (the browser expects milliseconds)
            timeout = timeout or 30 * 1000
            res: Response = await page.goto(
                url,
                referer=referer,
                timeout=timeout,
                waitUntil=settings.BROWSER_WAIT_UNTIL,
            )

            request = {
                "accept_language": accept_language,
                "browser": browser.version,
                "ignore_https_errors": ignore_https_errors,
                "referer": referer,
                "timeout": timeout,
                "user_agent": await page.evaluate("() => navigator.userAgent"),
            }

            url = page.url
            status = res.status
            screenshot_data = await page.screenshot()
            body = await page.content()
            sha256 = calculate_sha256(body)
            headers = res.headers

            await browser.close()
    except Error as e:
        raise e

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address) or ""
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
    )

    screenshot = Screenshot()
    screenshot.data = base64.b64encode(screenshot_data).decode()

    return SnapshotResult(
        screenshot=screenshot,
        snapshot=snapshot,
        scripts=scripts,
    )
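# Usage sketch (illustrative, not part of the original module): the browser-based
# take_snapshot() accepts the keyword arguments documented in its docstring; the
# values shown here are arbitrary examples.
if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        result = await take_snapshot(
            "https://example.com",
            accept_language="en-US",
            ignore_https_errors=True,
            timeout=15_000,  # milliseconds
        )
        print(result.snapshot.status, len(result.screenshot.data))

    asyncio.run(_demo())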
async def mock_preview(hostname: str):
    s = Screenshot()
    s.data = ""
    return s
async def import_as_snapshot(cls, uuid: str) -> SnapshotResult:
    """Import urlscan.io scan as a snapshot

    Arguments:
        uuid {str} -- Scan ID

    Returns:
        SnapshotResult -- Snapshot result built from the imported scan
    """
    instance = cls(uuid)
    result = await instance.result()

    # pick the first response with a 200 status
    requests = result.get("data", {}).get("requests", [])
    response = {}
    for request in requests:
        tmp = request.get("response", {}).get("response", {})
        if tmp.get("status") == 200:
            response = tmp
            break

    url = result.get("page", {}).get("url")
    submitted_url = result.get("task", {}).get("url")
    hostname = result.get("page", {}).get("domain")
    ip_address = result.get("page", {}).get("ip")
    asn = result.get("page", {}).get("asn")
    asnname = result.get("page", {}).get("asnname")

    headers = response.get("headers", {})
    server = result.get("page", {}).get("server")
    content_type = headers.get("Content-Type") or headers.get("content-type")
    content_length = headers.get("Content-Length") or headers.get("content-length")

    body = await instance.body()
    sha256 = result.get("lists", {}).get("hashes", [])[0]

    time = cast(str, result.get("task", {}).get("time"))
    created_at = datetime.datetime.strptime(time, "%Y-%m-%dT%H:%M:%S.%fZ")

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=200,
        hostname=hostname,
        ip_address=ip_address,
        asn=f"{asn} {asnname}",
        server=server,
        content_type=content_type,
        content_length=content_length,
        headers=headers,
        body=body,
        sha256=sha256,
        created_at=created_at,
        request={"urlscan.io": uuid},
    )

    data = await instance.screenshot()
    screenshot = Screenshot()
    screenshot.data = data

    return SnapshotResult(screenshot=screenshot, snapshot=snapshot, scripts=[])
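# Usage sketch (illustrative, not part of the original code): import_as_snapshot()
# takes `cls` and calls `cls(uuid)`, so it is meant to be invoked as a classmethod
# on its owning importer class. `UrlScan` below is a placeholder name for that
# class, and the scan UUID is a dummy value.
#
# if __name__ == "__main__":
#     import asyncio
#
#     async def _demo_import() -> None:
#         result = await UrlScan.import_as_snapshot(
#             "00000000-0000-0000-0000-000000000000"  # dummy scan ID
#         )
#         await result.snapshot.save()
#
#     asyncio.run(_demo_import())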