Exemple #1
0
 async def handle_response(response: Response) -> None:
     """Collect a JavaScript response into the enclosing ``scripts`` list

     Arguments:
         response {Response} -- A response object exposing headers, ok, url and text()
     """
     mime = response.headers.get("content-type", "")
     if not (response.ok and is_js_content_type(mime)):
         # Either the ok flag is unset or the content type is not JavaScript
         return

     body = await response.text()
     script = Script(
         url=response.url,
         content=body,
         sha256=calculate_sha256(body),
     )
     scripts.append(script)
Exemple #2
0
            async def response_handler(response: Response) -> None:
                """Append a Script to the enclosing ``scripts`` list for a JavaScript response

                Arguments:
                    response {Response} -- A response object exposing headers, ok, url and text()
                """
                # ``ok`` is tested first so headers are only read when it is set
                if response.ok and is_js_content_type(
                        response.headers.get("content-type", "")):
                    body = await response.text()
                    script = Script(
                        url=response.url,
                        content=body,
                        sha256=calculate_sha256(body),
                    )
                    scripts.append(script)
Exemple #3
0
    def build_from_snapshot(snapshot: Snapshot) -> List[Script]:
        """Build Script objects for the script sources found in a snapshot

        Arguments:
            snapshot {Snapshot} -- A snapshot providing url, body and id

        Returns:
            List[Script] -- One Script per source whose content is not None
        """
        found = get_script_sources(url=snapshot.url, body=snapshot.body)
        collected = []
        for src in found:
            text = get_script_content(src)
            if text is not None:
                collected.append(
                    Script(
                        url=src,
                        content=text,
                        sha256=calculate_sha256(text),
                        # fall back to -1 when the snapshot has no ID
                        snapshot_id=snapshot.id or -1,
                    ))
        return collected
Exemple #4
0
    async def from_snapshot(snapshot: Snapshot) -> List[Script]:
        """Fetch the script sources of a snapshot and build Script objects

        Arguments:
            snapshot {Snapshot} -- A snapshot providing url, body, id and the
                request settings dict

        Returns:
            List[Script] -- One Script per fetch result that is not None
        """
        script_urls = get_script_sources(url=snapshot.url, body=snapshot.body)

        # Reuse the settings of the original request, skipping unset values
        request_headers = {}
        for key in ("accept_language", "host", "user_agent"):
            value = snapshot.request.get(key)
            if value is not None:
                request_headers[key] = value

        verify = not snapshot.request.get("ignore_https_errors")

        async with httpx.AsyncClient(verify=verify) as client:
            # One deferred fetch per source
            jobs = [
                partial(get_script_content, client, script_url,
                        request_headers) for script_url in script_urls
            ]
            if not jobs:
                return []

            fetched = await aiometer.run_all(jobs, max_at_once=MAX_AT_ONCE)
            return [
                Script(
                    url=item.source,
                    content=item.content,
                    sha256=calculate_sha256(item.content),
                    # fall back to -1 when the snapshot has no ID
                    snapshot_id=snapshot.id or -1,
                ) for item in fetched if item is not None
            ]
Exemple #5
0
    async def take_snapshot(
        url: str,
        accept_language: Optional[str] = None,
        host: Optional[str] = None,
        ignore_https_errors: bool = False,
        referer: Optional[str] = None,
        timeout: Optional[int] = None,
        user_agent: Optional[str] = None,
    ) -> SnapshotResult:
        """Take a snapshot of a website by httpx

        Arguments:
            url {str} -- A URL of a website

        Keyword Arguments:
            accept_language {Optional[str]} -- Accept-language header to use (default: {None})
            host {Optional[str]} -- Host header to use (default: {None})
            ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
            referer {Optional[str]} -- Referer header to use (default: {None})
            timeout {Optional[int]} -- Maximum time to wait for in milliseconds,
                converted to whole seconds for httpx; 30 seconds when None (default: {None})
            user_agent {Optional[str]} -- User-agent header to use (default: {None})

        Returns:
            SnapshotResult -- The snapshot, an empty screenshot and the scripts
                returned by ScriptTask.process

        Raises:
            httpx.HTTPError -- Propagated unchanged when the request fails
        """
        submitted_url: str = url
        verify = not ignore_https_errors

        # The caller passes milliseconds; httpx expects seconds.
        # default timeout = 30 seconds
        timeout = int(timeout / 1000) if timeout is not None else 30

        headers = {
            "user-agent": user_agent or DEFAULT_UA,
            "accept-language": accept_language or DEFAULT_AL,
            "referer": referer or DEFAULT_REFERER,
        }
        if host is not None:
            headers["host"] = host

        # The context manager guarantees the client (and its connection pool)
        # is closed whether or not the request succeeds.
        async with httpx.AsyncClient(verify=verify) as client:
            res = await client.get(
                url,
                headers=headers,
                timeout=timeout,
                allow_redirects=True,
            )

            url = str(res.url)
            status = res.status_code
            body = res.text
            sha256 = calculate_sha256(body)
            # From here on ``headers`` holds the response headers (lower-cased keys)
            headers = {k.lower(): v for (k, v) in res.headers.items()}

        # Record the settings used; ``timeout`` here is the converted seconds value
        request = {
            "accept_language": accept_language,
            "browser": "httpx",
            "host": host,
            "ignore_https_errors": ignore_https_errors,
            "referer": referer,
            "timeout": timeout,
            "user_agent": user_agent,
        }

        server = headers.get("server")
        content_type = headers.get("content-type")
        content_length = headers.get("content-length")

        hostname = cast(str, get_hostname_from_url(url))
        certificate = Certificate.load_and_dump_from_url(url)
        ip_address = cast(str, get_ip_address_by_hostname(hostname))
        asn = get_asn_by_ip_address(ip_address) or ""
        whois = Whois.whois(hostname)

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=status,
            body=body,
            sha256=sha256,
            headers=headers,
            hostname=hostname,
            ip_address=ip_address,
            asn=asn,
            server=server,
            content_length=content_length,
            content_type=content_type,
            whois=whois,
            certificate=certificate,
            request=request,
        )
        # httpx cannot render a page, so the screenshot data is left empty
        screenshot = Screenshot()
        screenshot.data = ""

        # get scripts
        scripts = cast(List[Script], await
                       ScriptTask.process(snapshot, insert_to_db=False))

        return SnapshotResult(screenshot=screenshot,
                              snapshot=snapshot,
                              scripts=scripts)
Exemple #6
0
    async def take_snapshot(
        url: str,
        accept_language: Optional[str] = None,
        ignore_https_errors: bool = False,
        referer: Optional[str] = None,
        timeout: Optional[int] = None,
        user_agent: Optional[str] = None,
    ) -> SnapshotResult:
        """Take a snapshot of a website by playwright

        Arguments:
            url {str} -- A URL of a website

        Keyword Arguments:
            accept_language {Optional[str]} -- Accept-language header to use (default: {None})
            ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
            referer {Optional[str]} -- Referer header to use (default: {None})
            timeout {Optional[int]} -- Maximum time to wait for in milliseconds;
                30 seconds when None or 0 (default: {None})
            user_agent {Optional[str]} -- User-agent header to use (default: {None})

        Returns:
            SnapshotResult -- The snapshot, a base64-encoded screenshot and the
                JavaScript responses captured while the page loaded

        Raises:
            Error -- Browser errors are propagated unchanged
        """
        submitted_url: str = url
        async with async_playwright() as p:
            browser: playwright.browser.Browser = await launch_browser(p)
            try:
                page: Page = await browser.newPage(
                    ignoreHTTPSErrors=ignore_https_errors,
                    userAgent=user_agent)

                headers = {}
                if accept_language is not None:
                    headers["Accept-Language"] = accept_language
                await page.setExtraHTTPHeaders(headers)

                # intercept responses on page to get scripts
                scripts: List[Script] = []
                # Strong references to the handler tasks so they can be
                # awaited before the browser is closed
                pending: list = []

                async def handle_response(response: Response) -> None:
                    content_type: str = response.headers.get(
                        "content-type", "")
                    if response.ok and is_js_content_type(content_type):
                        content = await response.text()
                        scripts.append(
                            Script(
                                url=response.url,
                                content=content,
                                sha256=calculate_sha256(content),
                            ))

                def schedule_handler(response: Response) -> None:
                    pending.append(
                        asyncio.create_task(handle_response(response)))

                page.on("response", schedule_handler)

                # default timeout = 30 seconds
                timeout = timeout or 30 * 1000
                res: Response = await page.goto(
                    url,
                    referer=referer,
                    timeout=timeout,
                    waitUntil=settings.BROWSER_WAIT_UNTIL,
                )

                request = {
                    "accept_language": accept_language,
                    "browser": browser.version,
                    "ignore_https_errors": ignore_https_errors,
                    "referer": referer,
                    "timeout": timeout,
                    "user_agent": await
                    page.evaluate("() => navigator.userAgent"),
                }

                url = page.url
                status = res.status
                screenshot_data = await page.screenshot()
                body = await page.content()
                sha256 = calculate_sha256(body)
                headers = res.headers

                # Let in-flight script handlers finish (bounded by the same
                # timeout, converted to seconds) so ``scripts`` is complete
                # and no task reads a body after the browser is gone.
                if pending:
                    _, unfinished = await asyncio.wait(
                        pending, timeout=timeout / 1000)
                    for task in unfinished:
                        task.cancel()
            finally:
                # Close the browser on both the success and the error path
                await browser.close()

        server = headers.get("server")
        content_type = headers.get("content-type")
        content_length = headers.get("content-length")

        hostname = cast(str, get_hostname_from_url(url))
        certificate = Certificate.load_and_dump_from_url(url)
        ip_address = cast(str, get_ip_address_by_hostname(hostname))
        asn = get_asn_by_ip_address(ip_address) or ""
        whois = Whois.whois(hostname)

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=status,
            body=body,
            sha256=sha256,
            headers=headers,
            hostname=hostname,
            ip_address=ip_address,
            asn=asn,
            server=server,
            content_length=content_length,
            content_type=content_type,
            whois=whois,
            certificate=certificate,
            request=request,
        )
        screenshot = Screenshot()
        screenshot.data = base64.b64encode(screenshot_data).decode()

        return SnapshotResult(
            screenshot=screenshot,
            snapshot=snapshot,
            scripts=scripts,
        )
Exemple #7
0
    async def take_snapshot(
        url: str,
        user_agent: Optional[str] = None,
        timeout: Optional[int] = None,
        ignore_https_errors: bool = False,
    ) -> Snapshot:
        """Take a snapshot of a website by puppeteer

        Arguments:
            url {str} -- A URL of a website

        Keyword Arguments:
            user_agent {Optional[str]} -- User agent to use (default: {None})
            timeout {Optional[int]} -- Maximum time to wait for in milliseconds;
                30 seconds when None (default: {None})
            ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})

        Returns:
            Snapshot -- Snapshot ORM instance

        Raises:
            PyppeteerError -- Browser errors are propagated unchanged
        """
        submitted_url: str = url
        # Bound before the try so the cleanup below is safe even when
        # launch() itself fails
        browser = None
        try:
            browser = await launch(
                headless=True,
                ignoreHTTPSErrors=ignore_https_errors,
                args=["--no-sandbox"],
            )
            page = await browser.newPage()

            if user_agent is not None:
                await page.setUserAgent(user_agent)

            # default timeout = 30 seconds
            timeout = timeout if timeout is not None else 30 * 1000
            res = await page.goto(url, timeout=timeout)

            request = {
                "browser": await browser.version(),
                "ignore_https_errors": ignore_https_errors,
                "timeout": timeout,
                "user_agent": user_agent or await browser.userAgent(),
            }

            url = page.url
            status = res.status
            screenshot = await page.screenshot(encoding="base64")
            body = await page.content()
            sha256 = calculate_sha256(body)
            headers = res.headers
        finally:
            # Close the browser exactly once, on success and on error alike
            if browser is not None:
                await browser.close()

        server = headers.get("server")
        content_type = headers.get("content-type")
        content_length = headers.get("content-length")

        hostname = cast(str, get_hostname_from_url(url))
        certificate = Certificate.load_and_dump_from_url(url)
        ip_address = cast(str, get_ip_address_by_hostname(hostname))
        asn = get_asn_by_ip_address(ip_address)
        whois = Whois.whois(hostname)

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=status,
            body=body,
            sha256=sha256,
            headers=headers,
            hostname=hostname,
            ip_address=ip_address,
            asn=asn,
            server=server,
            content_length=content_length,
            content_type=content_type,
            whois=whois,
            certificate=certificate,
            request=request,
            screenshot=screenshot,
        )

        return snapshot