async def handle_response(response: Response) -> None:
    """Record a Script entry for every successful JavaScript response.

    Appends to the enclosing scope's ``scripts`` list; non-OK or
    non-JavaScript responses are ignored.
    """
    mime = response.headers.get("content-type", "")
    if not (response.ok and is_js_content_type(mime)):
        return

    body = await response.text()
    scripts.append(
        Script(url=response.url, content=body, sha256=calculate_sha256(body))
    )
async def response_handler(response: Response) -> None:
    """Append a Script record when a response is OK and contains JavaScript.

    Mutates the enclosing scope's ``scripts`` list as a side effect.
    """
    ctype = response.headers.get("content-type", "")
    if response.ok and is_js_content_type(ctype):
        text = await response.text()
        scripts.append(
            Script(url=response.url, content=text, sha256=calculate_sha256(text))
        )
def build_from_snapshot(snapshot: Snapshot) -> List[Script]:
    """Build Script records for every resolvable script source in *snapshot*.

    Sources whose content cannot be retrieved (``get_script_content``
    returns None) are silently skipped.
    """
    resolved = (
        (src, get_script_content(src))
        for src in get_script_sources(url=snapshot.url, body=snapshot.body)
    )
    return [
        Script(
            url=src,
            content=content,
            sha256=calculate_sha256(content),
            # insert a dummy ID if a snapshot doesn't have ID
            snapshot_id=snapshot.id or -1,
        )
        for src, content in resolved
        if content is not None
    ]
async def from_snapshot(snapshot: Snapshot) -> List[Script]:
    """Concurrently fetch the external scripts referenced by *snapshot*.

    Re-uses the accept-language / host / user-agent headers and the
    ``ignore_https_errors`` setting recorded in ``snapshot.request`` so the
    fetches resemble the original request. Fetches run through aiometer
    with at most ``MAX_AT_ONCE`` in flight.

    Returns:
        List[Script] -- one entry per successfully fetched source;
        failed fetches (``None`` results) are skipped.
    """
    sources = get_script_sources(url=snapshot.url, body=snapshot.body)
    # Fixed: return before opening an HTTP client when there is no work
    # (the original built the client first and checked `len(tasks) <= 0` after)
    if not sources:
        return []

    # Use the same settings as the original request
    headers = {
        "accept_language": snapshot.request.get("accept_language"),
        "host": snapshot.request.get("host"),
        "user_agent": snapshot.request.get("user_agent"),
    }
    # Remove None values so only explicitly-set headers are sent
    headers = {k: v for k, v in headers.items() if v is not None}

    # ignore_https_errors=True means "do not verify certificates"
    verify = not snapshot.request.get("ignore_https_errors")

    async with httpx.AsyncClient(verify=verify) as client:
        tasks = [
            partial(get_script_content, client, source, headers)
            for source in sources
        ]
        results = await aiometer.run_all(tasks, max_at_once=MAX_AT_ONCE)

    scripts: List[Script] = []
    for result in results:
        if result is None:
            continue
        scripts.append(
            Script(
                url=result.source,
                content=result.content,
                sha256=calculate_sha256(result.content),
                # insert a dummy ID if a snapshot doesn't have ID
                snapshot_id=snapshot.id or -1,
            )
        )
    return scripts
async def take_snapshot(
    url: str,
    accept_language: Optional[str] = None,
    host: Optional[str] = None,
    ignore_https_errors: bool = False,
    referer: Optional[str] = None,
    timeout: Optional[int] = None,
    user_agent: Optional[str] = None,
) -> SnapshotResult:
    """Take a snapshot of a website by httpx

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        accept_language {Optional[str]} -- Accept-language header to use (default: {None})
        host {Optional[str]} -- Host header to use (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
        referer {Optional[str]} -- Referer header to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for, in milliseconds;
            converted to seconds internally (default: {None} = 30 seconds)
        user_agent {Optional[str]} -- User-agent header to use (default: {None})

    Returns:
        SnapshotResult
    """
    submitted_url: str = url
    verify = not ignore_https_errors

    try:
        # Callers pass milliseconds; httpx wants seconds. Default = 30 seconds.
        timeout = int(timeout / 1000) if timeout is not None else 30

        headers = {
            "user-agent": user_agent or DEFAULT_UA,
            "accept-language": accept_language or DEFAULT_AL,
            "referer": referer or DEFAULT_REFERER,
        }
        if host is not None:
            headers["host"] = host

        # Fixed: the original created the AsyncClient without ever closing it,
        # leaking the connection pool. `async with` guarantees cleanup.
        async with httpx.AsyncClient(verify=verify) as client:
            res = await client.get(
                url,
                headers=headers,
                timeout=timeout,
                allow_redirects=True,
            )

        # Record the effective request settings alongside the snapshot
        request = {
            "accept_language": accept_language,
            "browser": "httpx",
            "host": host,
            "ignore_https_errors": ignore_https_errors,
            "referer": referer,
            "timeout": timeout,
            "user_agent": user_agent,
        }

        url = str(res.url)  # final URL after redirects
        status = res.status_code
        body = res.text
        sha256 = calculate_sha256(body)
        # Normalize header names to lowercase for consistent lookups below
        headers = {k.lower(): v for (k, v) in res.headers.items()}
    except httpx.HTTPError:
        # Re-raise unchanged, preserving the original traceback
        raise

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address) or ""
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
    )

    # httpx cannot render pages, so the screenshot is intentionally empty
    screenshot = Screenshot()
    screenshot.data = ""

    # get scripts
    scripts = cast(List[Script], await ScriptTask.process(snapshot, insert_to_db=False))

    return SnapshotResult(screenshot=screenshot, snapshot=snapshot, scripts=scripts)
async def take_snapshot(
    url: str,
    accept_language: Optional[str] = None,
    ignore_https_errors: bool = False,
    referer: Optional[str] = None,
    timeout: Optional[int] = None,
    user_agent: Optional[str] = None,
) -> SnapshotResult:
    """Take a snapshot of a website by puppeteer

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        accept_language {Optional[str]} -- Accept-language header to use (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
        referer {Optional[str]} -- Referer header to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for, in milliseconds
            (default: {None} = 30000 ms)
        user_agent {Optional[str]} -- User-agent header to use (default: {None})

    Returns:
        SnapshotResult
    """
    submitted_url: str = url
    try:
        # The playwright context manager owns browser-process cleanup even
        # if an error escapes the body below.
        async with async_playwright() as p:
            browser: playwright.browser.Browser = await launch_browser(p)
            page: Page = await browser.newPage(
                ignoreHTTPSErrors=ignore_https_errors, userAgent=user_agent)

            headers = {}
            if accept_language is not None:
                headers["Accept-Language"] = accept_language
            await page.setExtraHTTPHeaders(headers)

            # intercept responses on page to get scripts
            scripts: List[Script] = []

            async def handle_response(response: Response) -> None:
                # Collect every successful JavaScript response as a Script
                content_type: str = response.headers.get(
                    "content-type", "")
                if response.ok and is_js_content_type(content_type):
                    content = await response.text()
                    scripts.append(
                        Script(
                            url=response.url,
                            content=content,
                            sha256=calculate_sha256(content),
                        ))

            # page.on takes a sync callback, so the coroutine is scheduled
            # as a task rather than awaited inline
            page.on(
                "response",
                lambda response: asyncio.create_task(
                    handle_response(response)),
            )

            # default timeout = 30 seconds
            timeout = timeout or 30 * 1000
            # NOTE(review): playwright's goto can return None for some
            # navigations, which would make res.status below raise — confirm
            # against the playwright version in use.
            res: Response = await page.goto(
                url,
                referer=referer,
                timeout=timeout,
                waitUntil=settings.BROWSER_WAIT_UNTIL,
            )

            # Record the effective request settings alongside the snapshot
            request = {
                "accept_language": accept_language,
                "browser": browser.version,
                "ignore_https_errors": ignore_https_errors,
                "referer": referer,
                "timeout": timeout,
                "user_agent": await page.evaluate("() => navigator.userAgent"),
            }

            url = page.url  # final URL after any redirects
            status = res.status
            screenshot_data = await page.screenshot()
            body = await page.content()
            sha256 = calculate_sha256(body)
            headers = res.headers

            await browser.close()
    except Error as e:
        # Propagate playwright errors to the caller unchanged
        raise (e)

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address) or ""
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
    )

    screenshot = Screenshot()
    # Screenshot bytes are stored base64-encoded
    screenshot.data = base64.b64encode(screenshot_data).decode()

    return SnapshotResult(
        screenshot=screenshot,
        snapshot=snapshot,
        scripts=scripts,
    )
async def take_snapshot(
    url: str,
    user_agent: Optional[str] = None,
    timeout: Optional[int] = None,
    ignore_https_errors: bool = False,
) -> Snapshot:
    """Take a snapshot of a website by puppeteer

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        user_agent {Optional[str]} -- User agent to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for, in milliseconds
            (default: {None} = 30000 ms)
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})

    Returns:
        Snapshot -- Snapshot ORM instance
    """
    submitted_url: str = url
    # Fixed: initialize before the try so the finally-clause cannot hit a
    # NameError when launch() itself raises (the original also closed the
    # browser up to twice via except/else AND finally).
    browser = None
    try:
        browser = await launch(
            headless=True,
            ignoreHTTPSErrors=ignore_https_errors,
            args=["--no-sandbox"],
        )
        page = await browser.newPage()
        if user_agent is not None:
            await page.setUserAgent(user_agent)

        # default timeout = 30 seconds
        timeout = timeout if timeout is not None else 30 * 1000
        res = await page.goto(url, timeout=timeout)

        # Record the effective request settings alongside the snapshot
        request = {
            "browser": await browser.version(),
            "ignore_https_errors": ignore_https_errors,
            "timeout": timeout,
            "user_agent": user_agent or await browser.userAgent(),
        }

        url = page.url  # final URL after any redirects
        status = res.status
        screenshot = await page.screenshot(encoding="base64")
        body = await page.content()
        sha256 = calculate_sha256(body)
        headers = res.headers
    finally:
        # Single, unconditional cleanup path: close the browser exactly once,
        # whether the block above succeeded or raised. PyppeteerError still
        # propagates to the caller unchanged.
        if browser is not None:
            await browser.close()

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address)
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
        screenshot=screenshot,
    )
    return snapshot