Ejemplo n.º 1
0
async def snapshots_setup(client):
    """Save ten snapshots, each paired with an empty screenshot.

    The rows differ only in the number embedded in ``url`` and
    ``submitted_url`` (1 through 10). ``client`` is accepted but is not
    used by the body.
    """
    # Values shared by every generated row.
    asn = "AS15133 MCI Communications Services, Inc. d/b/a Verizon Business"
    digest = "fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75"

    for index in range(1, 11):
        origin = f"http://example{index}.com"
        fields = dict(
            url=f"{origin}/",
            submitted_url=origin,
            status=200,
            hostname="example.com",
            ip_address="1.1.1.1",
            asn=asn,
            server="ECS (sjc/4E5D)",
            content_type="text/html; charset=UTF-8",
            content_length=1256,
            headers={},
            body="foo bar",
            sha256=digest,
            whois="foo",
            request={},
            created_at=datetime.datetime.now(),
        )
        snapshot = Snapshot(**fields)
        await snapshot.save()

        # The screenshot is saved after the snapshot so that it can point
        # at the snapshot's id.
        screenshot = Screenshot()
        screenshot.data = ""
        screenshot.snapshot_id = snapshot.id
        await screenshot.save()
Ejemplo n.º 2
0
async def mock_take_snapshot(*args, **kwargs):
    """Return a canned ``SnapshotResult`` for https://www.w3.org/.

    All positional and keyword arguments are accepted and ignored. The
    result carries one snapshot, an empty screenshot and a single script
    entry.
    """
    empty_screenshot = Screenshot()
    empty_screenshot.data = ""

    page_html = '<html><body><script type="text/javascript" src="/2008/site/js/main"></body></html>'
    canned_snapshot = Snapshot(
        url="https://www.w3.org/",
        submitted_url="https://www.w3.org",
        status=200,
        hostname="example.com",
        ip_address="1.1.1.1",
        asn="AS15133 MCI Communications Services, Inc. d/b/a Verizon Business",
        server="ECS (sjc/4E5D)",
        content_type="text/html; charset=UTF-8",
        content_length=1256,
        headers={},
        body=page_html,
        sha256="fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75",
        screenshot=Screenshot(data=""),
        whois="foo",
        request={},
    )

    # The script URL matches the src attribute referenced in the body.
    canned_scripts = [
        Script(
            url="https://www.w3.org/2008/site/js/main",
            content="foo",
            sha256="dummy",
        )
    ]

    return SnapshotResult(
        snapshot=canned_snapshot,
        screenshot=empty_screenshot,
        scripts=canned_scripts,
    )
Ejemplo n.º 3
0
async def make_snapshot_result() -> SnapshotResult:
    """Build a ``SnapshotResult`` for http://example.com/ with fixed data.

    The snapshot gets a random UUID as its id and the current time as
    ``created_at``; the screenshot is empty and there are no scripts.
    """
    fixture_snapshot = Snapshot(
        id=uuid.uuid4(),
        url="http://example.com/",
        submitted_url="http://example.com",
        status=200,
        hostname="example.com",
        ip_address="1.1.1.1",
        asn="AS15133 MCI Communications Services, Inc. d/b/a Verizon Business",
        server="ECS (sjc/4E5D)",
        content_type="text/html; charset=UTF-8",
        content_length=1256,
        headers={},
        body="foo bar",
        sha256="fbc1a9f858ea9e177916964bd88c3d37b91a1e84412765e29950777f265c4b75",
        screenshot="yoyo",
        whois="foo",
        request={},
        created_at=datetime.datetime.now(),
    )

    blank_screenshot = Screenshot()
    blank_screenshot.data = ""

    return SnapshotResult(
        snapshot=fixture_snapshot,
        screenshot=blank_screenshot,
        scripts=[],
    )
Ejemplo n.º 4
0
    async def preview(hostname: str) -> Screenshot:
        """Take a screenshot of a host's front page with Playwright.

        The page is requested over http first and, if that raises
        ``Error``, over https.

        Arguments:
            hostname {str} -- Hostname to open

        Returns:
            Screenshot -- ``data`` holds the base64-encoded image, or an
            empty string when both attempts raise ``Error``
        """
        async def _preview(hostname: str, protocol="http") -> Screenshot:
            async with async_playwright() as p:
                browser = await launch_browser(p)
                try:
                    page = await browser.newPage()
                    await page.goto(
                        f"{protocol}://{hostname}",
                        waitUntil=settings.BROWSER_WAIT_UNTIL,
                    )
                    screenshot_data = await page.screenshot()
                finally:
                    # Close the browser even when navigation or capture fails.
                    await browser.close()

            screenshot = Screenshot()
            screenshot.data = base64.b64encode(screenshot_data).decode()
            return screenshot

        for protocol in ("http", "https"):
            try:
                return await _preview(hostname, protocol)
            except Error:
                # Fall through to the next protocol (or the empty result).
                continue

        screenshot = Screenshot()
        screenshot.data = ""
        return screenshot
Ejemplo n.º 5
0
    async def preview(hostname: str) -> Screenshot:
        """Take a screenshot of a host's front page with pyppeteer.

        The page is requested over http first and, if that raises
        ``PyppeteerError``, over https.

        Arguments:
            hostname {str} -- Hostname to open

        Returns:
            Screenshot -- ``data`` holds the base64-encoded image, or an
            empty string when both attempts raise ``PyppeteerError``
        """
        async def _preview(hostname: str, protocol="http") -> Screenshot:
            browser = await launch_browser()
            try:
                page = await browser.newPage()
                # The keyword must be spelled ``waitUntil``; a misspelled
                # option is not applied by pyppeteer.
                await page.goto(f"{protocol}://{hostname}",
                                waitUntil=settings.BROWSER_WAIT_UNTIL)
                screenshot_data = await page.screenshot(encoding="base64")
            finally:
                # Close the browser even when navigation or capture fails.
                await browser.close()

            screenshot = Screenshot()
            screenshot.data = cast(str, screenshot_data)
            return screenshot

        for protocol in ("http", "https"):
            try:
                return await _preview(hostname, protocol)
            except PyppeteerError:
                # Fall through to the next protocol (or the empty result).
                continue

        screenshot = Screenshot()
        screenshot.data = ""
        return screenshot
Ejemplo n.º 6
0
    async def take_snapshot(
        url: str,
        accept_language: Optional[str] = None,
        host: Optional[str] = None,
        ignore_https_errors: bool = False,
        referer: Optional[str] = None,
        timeout: Optional[int] = None,
        user_agent: Optional[str] = None,
    ) -> SnapshotResult:
        """Take a snapshot of a website by httpx

        Arguments:
            url {str} -- A URL of a website

        Keyword Arguments:
            accept_language {Optional[str]} -- Accept-language header to use (default: {None})
            host {Optional[str]} -- Host header to use (default: {None})
            ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
            referer {Optional[str]} -- Referer header to use (default: {None})
            timeout {Optional[int]} -- Maximum time to wait for in milliseconds; converted to whole seconds for httpx, 30 seconds when omitted (default: {None})
            user_agent {Optional[str]} -- User-agent header to use (default: {None})

        Returns:
            SnapshotResult -- the snapshot, an empty screenshot and the
            scripts returned by ``ScriptTask.process``

        Raises:
            httpx.HTTPError -- propagated unchanged when the request fails
        """
        submitted_url: str = url
        verify = not ignore_https_errors

        # The value arrives in milliseconds; httpx takes seconds.
        timeout = int(timeout / 1000) if timeout is not None else 30

        request_headers = {
            "user-agent": user_agent or DEFAULT_UA,
            "accept-language": accept_language or DEFAULT_AL,
            "referer": referer or DEFAULT_REFERER,
        }
        if host is not None:
            request_headers["host"] = host

        # The context manager closes the client (and its connections) even
        # when the request raises.
        async with httpx.AsyncClient(verify=verify) as client:
            res = await client.get(
                url,
                headers=request_headers,
                timeout=timeout,
                allow_redirects=True,
            )

            # After redirects the final URL may differ from the submitted one.
            url = str(res.url)
            status = res.status_code
            body = res.text
            sha256 = calculate_sha256(body)
            headers = {k.lower(): v for (k, v) in res.headers.items()}

        # Record the options as given by the caller (timeout in seconds).
        request = {
            "accept_language": accept_language,
            "browser": "httpx",
            "host": host,
            "ignore_https_errors": ignore_https_errors,
            "referer": referer,
            "timeout": timeout,
            "user_agent": user_agent,
        }

        server = headers.get("server")
        content_type = headers.get("content-type")
        content_length = headers.get("content-length")

        hostname = cast(str, get_hostname_from_url(url))
        certificate = Certificate.load_and_dump_from_url(url)
        ip_address = cast(str, get_ip_address_by_hostname(hostname))
        asn = get_asn_by_ip_address(ip_address) or ""
        whois = Whois.whois(hostname)

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=status,
            body=body,
            sha256=sha256,
            headers=headers,
            hostname=hostname,
            ip_address=ip_address,
            asn=asn,
            server=server,
            content_length=content_length,
            content_type=content_type,
            whois=whois,
            certificate=certificate,
            request=request,
        )

        # httpx renders nothing, so the screenshot is left empty.
        screenshot = Screenshot()
        screenshot.data = ""

        # get scripts
        scripts = cast(List[Script], await
                       ScriptTask.process(snapshot, insert_to_db=False))

        return SnapshotResult(screenshot=screenshot,
                              snapshot=snapshot,
                              scripts=scripts)
Ejemplo n.º 7
0
    async def take_snapshot(
        url: str,
        accept_language: Optional[str] = None,
        ignore_https_errors: bool = False,
        referer: Optional[str] = None,
        timeout: Optional[int] = None,
        user_agent: Optional[str] = None,
    ) -> SnapshotResult:
        """Take a snapshot of a website by Playwright

        Arguments:
            url {str} -- A URL of a website

        Keyword Arguments:
            accept_language {Optional[str]} -- Accept-language header to use (default: {None})
            ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
            referer {Optional[str]} -- Referer header to use (default: {None})
            timeout {Optional[int]} -- Maximum time to wait for in milliseconds, 30000 when omitted (default: {None})
            user_agent {Optional[str]} -- User-agent header to use (default: {None})

        Returns:
            SnapshotResult -- the snapshot, a base64-encoded screenshot and
            the JavaScript responses collected while the page loaded

        Raises:
            Error -- propagated unchanged when the browser operation fails
        """
        submitted_url: str = url
        async with async_playwright() as p:
            browser: playwright.browser.Browser = await launch_browser(p)
            try:
                page: Page = await browser.newPage(
                    ignoreHTTPSErrors=ignore_https_errors,
                    userAgent=user_agent)

                extra_headers = {}
                if accept_language is not None:
                    extra_headers["Accept-Language"] = accept_language
                await page.setExtraHTTPHeaders(extra_headers)

                # intercept responses on page to get scripts
                scripts: List[Script] = []

                async def handle_response(response: Response) -> None:
                    content_type: str = response.headers.get(
                        "content-type", "")
                    if response.ok and is_js_content_type(content_type):
                        content = await response.text()
                        scripts.append(
                            Script(
                                url=response.url,
                                content=content,
                                sha256=calculate_sha256(content),
                            ))

                # The event callback is synchronous, so each response is
                # handed to a task.
                # NOTE(review): these tasks are not awaited before the
                # browser closes -- confirm late responses are not lost.
                page.on(
                    "response",
                    lambda response: asyncio.create_task(
                        handle_response(response)),
                )

                # default timeout = 30 seconds (expressed in milliseconds)
                timeout = timeout or 30 * 1000
                res: Response = await page.goto(
                    url,
                    referer=referer,
                    timeout=timeout,
                    waitUntil=settings.BROWSER_WAIT_UNTIL,
                )

                request = {
                    "accept_language": accept_language,
                    "browser": browser.version,
                    "ignore_https_errors": ignore_https_errors,
                    "referer": referer,
                    "timeout": timeout,
                    # Ask the page itself so the effective user agent is
                    # recorded even when none was supplied.
                    "user_agent": await
                    page.evaluate("() => navigator.userAgent"),
                }

                url = page.url
                status = res.status
                screenshot_data = await page.screenshot()
                body = await page.content()
                sha256 = calculate_sha256(body)
                headers = res.headers
            finally:
                # Close the browser even when navigation or capture fails.
                await browser.close()

        server = headers.get("server")
        content_type = headers.get("content-type")
        content_length = headers.get("content-length")

        hostname = cast(str, get_hostname_from_url(url))
        certificate = Certificate.load_and_dump_from_url(url)
        ip_address = cast(str, get_ip_address_by_hostname(hostname))
        asn = get_asn_by_ip_address(ip_address) or ""
        whois = Whois.whois(hostname)

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=status,
            body=body,
            sha256=sha256,
            headers=headers,
            hostname=hostname,
            ip_address=ip_address,
            asn=asn,
            server=server,
            content_length=content_length,
            content_type=content_type,
            whois=whois,
            certificate=certificate,
            request=request,
        )
        screenshot = Screenshot()
        screenshot.data = base64.b64encode(screenshot_data).decode()

        return SnapshotResult(
            screenshot=screenshot,
            snapshot=snapshot,
            scripts=scripts,
        )
Ejemplo n.º 8
0
async def mock_preview(hostname: str):
    """Return a screenshot whose data is an empty string.

    ``hostname`` is accepted but not used.
    """
    blank = Screenshot()
    blank.data = ""
    return blank
Ejemplo n.º 9
0
    async def import_as_snapshot(cls, uuid: str) -> SnapshotResult:
        """Import urlscan.io scan as a snapshot

        Arguments:
            uuid {str} -- Scan ID

        Returns:
            SnapshotResult -- an unsaved snapshot built from the scan, its
            screenshot and an empty script list
        """
        import hashlib

        instance = cls(uuid)
        result = await instance.result()

        # Headers are taken from the first recorded response with status 200.
        requests = result.get("data", {}).get("requests", [])
        response = {}
        for request in requests:
            candidate = request.get("response", {}).get("response", {})
            if candidate.get("status") == 200:
                response = candidate
                break

        page = result.get("page", {})
        task = result.get("task", {})

        url = page.get("url")
        submitted_url = task.get("url")
        hostname = page.get("domain")
        ip_address = page.get("ip")
        asn = page.get("asn")
        asnname = page.get("asnname")
        server = page.get("server")

        # Header names are looked up in both capitalised and lower-case form.
        headers = response.get("headers", {})
        content_type = headers.get("Content-Type") or headers.get(
            "content-type")
        content_length = headers.get("Content-Length") or headers.get(
            "content-length")

        body = await instance.body()

        hashes = result.get("lists", {}).get("hashes", [])
        if hashes:
            sha256 = hashes[0]
        else:
            # The scan has no hash list; hash the body instead of failing
            # with an IndexError.
            # NOTE(review): assumes body is str or bytes -- confirm.
            raw_body = body.encode() if isinstance(body, str) else body
            sha256 = hashlib.sha256(raw_body).hexdigest()

        scanned_at = cast(str, task.get("time"))
        created_at = datetime.datetime.strptime(scanned_at,
                                                "%Y-%m-%dT%H:%M:%S.%fZ")

        snapshot = Snapshot(
            url=url,
            submitted_url=submitted_url,
            status=200,
            hostname=hostname,
            ip_address=ip_address,
            asn=f"{asn} {asnname}",
            server=server,
            content_type=content_type,
            content_length=content_length,
            headers=headers,
            body=body,
            sha256=sha256,
            created_at=created_at,
            request={"urlscan.io": uuid},
        )

        data = await instance.screenshot()
        screenshot = Screenshot()
        screenshot.data = data

        return SnapshotResult(screenshot=screenshot,
                              snapshot=snapshot,
                              scripts=[])