Example #1
0
 def hostname_must_resolvable(cls, v):
     """Check that the hostname of the URL ``v`` resolves to an IP address.

     Returns ``v`` unchanged when the lookup yields an address and raises
     ValueError when the lookup yields None.
     """
     hostname = cast(str, get_hostname_from_url(v))
     # Success path first: any non-None lookup result counts as resolvable.
     if get_ip_address_by_hostname(hostname) is not None:
         return v
     raise ValueError(f"Cannot resolve hostname: {hostname}.")
Example #2
0
    async def take_snapshot(
        url: str,
        accept_language: Optional[str] = None,
        ignore_https_errors: bool = False,
        referer: Optional[str] = None,
        timeout: Optional[int] = None,
        user_agent: Optional[str] = None,
    ) -> SnapshotResult:
        """Take a snapshot of a website with a Playwright-driven browser

        Arguments:
            url {str} -- A URL of a website

        Keyword Arguments:
            accept_language {Optional[str]} -- Accept-language header to use (default: {None})
            ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
            referer {Optional[str]} -- Referer header to use (default: {None})
            timeout {Optional[int]} -- Maximum navigation time in milliseconds;
                None or 0 falls back to 30 seconds (default: {None})
            user_agent {Optional[str]} -- User-agent header to use (default: {None})

        Returns:
            SnapshotResult -- the snapshot, a screenshot whose data is
                base64-encoded, and the JavaScript responses captured
                while the page was loading
        """
        submitted_url: str = url

        # scripts captured from JavaScript responses seen on the page
        scripts: List[Script] = []
        # tasks spawned by the response listener; they are waited for before
        # the browser is closed so their response bodies can still be read
        script_tasks: List[asyncio.Task] = []

        async def handle_response(response: Response) -> None:
            content_type: str = response.headers.get("content-type", "")
            if response.ok and is_js_content_type(content_type):
                content = await response.text()
                scripts.append(
                    Script(
                        url=response.url,
                        content=content,
                        sha256=calculate_sha256(content),
                    ))

        def schedule_handle_response(response: Response) -> None:
            # the listener is synchronous, so hand the work to a task and
            # keep a reference to it
            script_tasks.append(
                asyncio.create_task(handle_response(response)))

        async with async_playwright() as p:
            browser: playwright.browser.Browser = await launch_browser(p)
            try:
                page: Page = await browser.newPage(
                    ignoreHTTPSErrors=ignore_https_errors,
                    userAgent=user_agent)

                extra_headers = {}
                if accept_language is not None:
                    extra_headers["Accept-Language"] = accept_language
                await page.setExtraHTTPHeaders(extra_headers)

                # intercept responses on page to get scripts
                page.on("response", schedule_handle_response)

                # default timeout = 30 seconds (the value is in milliseconds)
                timeout = timeout or 30 * 1000
                res: Response = await page.goto(
                    url,
                    referer=referer,
                    timeout=timeout,
                    waitUntil=settings.BROWSER_WAIT_UNTIL,
                )

                request = {
                    "accept_language": accept_language,
                    "browser": browser.version,
                    "ignore_https_errors": ignore_https_errors,
                    "referer": referer,
                    "timeout": timeout,
                    # ask the page itself so the effective value is recorded
                    # even when no user agent was supplied
                    "user_agent": await
                    page.evaluate("() => navigator.userAgent"),
                }

                # the final URL may differ from the submitted one
                url = page.url
                status = res.status
                screenshot_data = await page.screenshot()
                body = await page.content()
                sha256 = calculate_sha256(body)
                headers = res.headers

                # give in-flight script handlers a bounded amount of time to
                # finish; whatever is still pending afterwards is cancelled
                if script_tasks:
                    _, unfinished = await asyncio.wait(
                        script_tasks, timeout=timeout / 1000)
                    for task in unfinished:
                        task.cancel()
            finally:
                # always release the browser, also when navigation fails
                await browser.close()

        server = headers.get("server")
        content_type = headers.get("content-type")
        content_length = headers.get("content-length")

        hostname = cast(str, get_hostname_from_url(url))
        certificate = Certificate.load_and_dump_from_url(url)
        ip_address = cast(str, get_ip_address_by_hostname(hostname))
        asn = get_asn_by_ip_address(ip_address) or ""
        whois = Whois.whois(hostname)

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=status,
            body=body,
            sha256=sha256,
            headers=headers,
            hostname=hostname,
            ip_address=ip_address,
            asn=asn,
            server=server,
            content_length=content_length,
            content_type=content_type,
            whois=whois,
            certificate=certificate,
            request=request,
        )
        screenshot = Screenshot()
        screenshot.data = base64.b64encode(screenshot_data).decode()

        return SnapshotResult(
            screenshot=screenshot,
            snapshot=snapshot,
            scripts=scripts,
        )
Example #3
0
    async def take_snapshot(
        url: str,
        accept_language: Optional[str] = None,
        host: Optional[str] = None,
        ignore_https_errors: bool = False,
        referer: Optional[str] = None,
        timeout: Optional[int] = None,
        user_agent: Optional[str] = None,
    ) -> SnapshotResult:
        """Take a snapshot of a website by httpx

        Arguments:
            url {str} -- A URL of a website

        Keyword Arguments:
            accept_language {Optional[str]} -- Accept-language header to use (default: {None})
            host {Optional[str]} -- Host header to use (default: {None})
            ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
            referer {Optional[str]} -- Referer header to use (default: {None})
            timeout {Optional[int]} -- Maximum time to wait for in milliseconds;
                converted to whole seconds for the request, 30 seconds when None (default: {None})
            user_agent {Optional[str]} -- User-agent header to use (default: {None})

        Returns:
            SnapshotResult -- the snapshot, an empty screenshot (no browser
                is involved) and the scripts returned by ScriptTask.process
        """
        submitted_url: str = url
        verify = not ignore_https_errors

        # the incoming value is divided by 1000 to get seconds;
        # default timeout = 30 seconds
        timeout = int(timeout / 1000) if timeout is not None else 30

        request_headers = {
            "user-agent": user_agent or DEFAULT_UA,
            "accept-language": accept_language or DEFAULT_AL,
            "referer": referer or DEFAULT_REFERER,
        }
        if host is not None:
            request_headers["host"] = host

        # the context manager closes the client (and its connections) even
        # when the request raises; httpx errors propagate to the caller
        async with httpx.AsyncClient(verify=verify) as client:
            res = await client.get(
                url,
                headers=request_headers,
                timeout=timeout,
                allow_redirects=True,
            )

        request = {
            "accept_language": accept_language,
            "browser": "httpx",
            "host": host,
            "ignore_https_errors": ignore_https_errors,
            "referer": referer,
            "timeout": timeout,
            "user_agent": user_agent,
        }

        # the final URL may differ from the submitted one after redirects
        url = str(res.url)
        status = res.status_code
        body = res.text
        sha256 = calculate_sha256(body)
        # lower-case the header names so the lookups below are predictable
        headers = {k.lower(): v for (k, v) in res.headers.items()}

        server = headers.get("server")
        content_type = headers.get("content-type")
        content_length = headers.get("content-length")

        hostname = cast(str, get_hostname_from_url(url))
        certificate = Certificate.load_and_dump_from_url(url)
        ip_address = cast(str, get_ip_address_by_hostname(hostname))
        asn = get_asn_by_ip_address(ip_address) or ""
        whois = Whois.whois(hostname)

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=status,
            body=body,
            sha256=sha256,
            headers=headers,
            hostname=hostname,
            ip_address=ip_address,
            asn=asn,
            server=server,
            content_length=content_length,
            content_type=content_type,
            whois=whois,
            certificate=certificate,
            request=request,
        )
        screenshot = Screenshot()
        screenshot.data = ""

        # get scripts
        scripts = cast(List[Script], await
                       ScriptTask.process(snapshot, insert_to_db=False))

        return SnapshotResult(screenshot=screenshot,
                              snapshot=snapshot,
                              scripts=scripts)
Example #4
0
    async def take_snapshot(
        url: str,
        user_agent: Optional[str] = None,
        timeout: Optional[int] = None,
        ignore_https_errors: bool = False,
    ) -> Snapshot:
        """Take a snapshot of a website with a headless browser

        Arguments:
            url {str} -- A URL of a website

        Keyword Arguments:
            user_agent {Optional[str]} -- User agent to use (default: {None})
            timeout {Optional[int]} -- Maximum navigation time in milliseconds;
                30 seconds when None (default: {None})
            ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})

        Returns:
            Snapshot -- Snapshot ORM instance
        """
        submitted_url: str = url
        # bound before the try block so the cleanup in ``finally`` is safe
        # even when launch() itself fails
        browser = None
        try:
            browser = await launch(
                headless=True,
                ignoreHTTPSErrors=ignore_https_errors,
                args=["--no-sandbox"],
            )
            page = await browser.newPage()

            if user_agent is not None:
                await page.setUserAgent(user_agent)

            # default timeout = 30 seconds (the value is in milliseconds)
            timeout = timeout if timeout is not None else 30 * 1000
            res = await page.goto(url, timeout=timeout)

            request = {
                "browser": await browser.version(),
                "ignore_https_errors": ignore_https_errors,
                "timeout": timeout,
                "user_agent": user_agent or await browser.userAgent(),
            }

            # the final URL may differ from the submitted one
            url = page.url
            status = res.status
            screenshot = await page.screenshot(encoding="base64")
            body = await page.content()
            sha256 = calculate_sha256(body)
            headers = res.headers
        finally:
            # close the browser exactly once, on success and on failure;
            # any exception keeps propagating to the caller
            if browser is not None:
                await browser.close()

        server = headers.get("server")
        content_type = headers.get("content-type")
        content_length = headers.get("content-length")

        hostname = cast(str, get_hostname_from_url(url))
        certificate = Certificate.load_and_dump_from_url(url)
        ip_address = cast(str, get_ip_address_by_hostname(hostname))
        asn = get_asn_by_ip_address(ip_address)
        whois = Whois.whois(hostname)

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=status,
            body=body,
            sha256=sha256,
            headers=headers,
            hostname=hostname,
            ip_address=ip_address,
            asn=asn,
            server=server,
            content_length=content_length,
            content_type=content_type,
            whois=whois,
            certificate=certificate,
            request=request,
            screenshot=screenshot,
        )

        return snapshot