Example #1
0
 def test_should_extract_images_from_srcset(self):
     # Each candidate listed in an <img srcset> should come back as its own
     # 'img-srcset' asset, in document order and without the width descriptor.
     markup = """
     <html><body><img srcset="image-320w.jpg 320w,
          image-480w.jpg 480w,
          image-800w.jpg 800w"></body></html>
     """
     document = BeautifulSoup(markup, "lxml")
     expected = [
         Asset(resource=name, kind='img-srcset', initiator=self.test_url)
         for name in ('image-320w.jpg', 'image-480w.jpg', 'image-800w.jpg')
     ]
     self.assertEqual(expected, extract_assets(document, self.test_url))
Example #2
0
 def test_should_extract_urls_from_video(self):
     # A <video> element contributes two assets: its `src` ('video-src')
     # followed by its `poster` image ('video-poster').
     markup = """<html><body>
     <video controls
            src="https://archive.org/download/BigBuckBunny_124/Content/big_buck_bunny_720p_surround.mp4"
            poster="https://peach.blender.org/wp-content/uploads/title_anouncement.jpg?x11217"
            width="620"></body></html>
     """
     document = BeautifulSoup(markup, "lxml")
     video_url = 'https://archive.org/download/BigBuckBunny_124/Content/big_buck_bunny_720p_surround.mp4'
     poster_url = 'https://peach.blender.org/wp-content/uploads/title_anouncement.jpg?x11217'
     expected = [
         Asset(resource=video_url, kind='video-src', initiator=self.test_url),
         Asset(resource=poster_url, kind='video-poster', initiator=self.test_url),
     ]
     self.assertEqual(expected, extract_assets(document, self.test_url))
Example #3
0
def perform_scan(url: str, permitted_domains: List[str]) -> ScanResult:
    """Scan the landing page at ``url`` and gather the findings in a ScanResult.

    Args:
        url: Address of the landing page to request.
        permitted_domains: Domains handed to ``parse_assets`` in addition to
            the registered domain of the page that was actually retrieved.

    Returns:
        A ``ScanResult`` built from the accumulated fields. When the request
        raises ``requests.exceptions.RequestException`` the result carries
        only the initial fields plus ``http_status_200_ok=False``.
    """
    findings = {'live': False, 'landing_page_url': url}

    try:
        page, soup = request_and_scrape_page(url)
    except requests.exceptions.RequestException:
        # Base class of the requests errors: timeouts, invalid HTTP
        # responses and other network problems all end the scan here.
        findings['http_status_200_ok'] = False
        return ScanResult(**findings)

    # Fold in what the HTTP response and the parsed markup reveal.
    findings.update(parse_page_data(page))
    findings.update(parse_soup_data(soup))

    # Assets are checked against the page's own registered domain first,
    # then against the caller-supplied permitted domains.
    assets = extract_assets(soup, page.url)
    own_domain = tldextract.extract(page.url).registered_domain
    findings.update(parse_assets(assets, [own_domain] + permitted_domains))

    # inspect_domains receives a single domain; only its first result is used.
    pshtt_results = inspect_domains([url_to_domain(page.url)], {'timeout': 10})
    findings.update(parse_pshtt_data(pshtt_results[0]))

    return ScanResult(**findings)
Example #4
0
 def test_should_extract_urls_from_embeds(self):
     # The `src` of an <embed> element is reported as an 'embed-src' asset.
     markup = """<html><body><embed type="video/quicktime" src="movie.mov" width="640" height="480"></body></html>
     """
     document = BeautifulSoup(markup, "lxml")
     expected = [
         Asset(resource='movie.mov', kind='embed-src', initiator=self.test_url),
     ]
     self.assertEqual(expected, extract_assets(document, self.test_url))
Example #5
0
 def test_should_extract_urls_from_audio(self):
     # The `src` of an <audio> element is reported as an 'audio-src' asset.
     markup = """<html><body><audio src="audio.wav"></body></html>
     """
     document = BeautifulSoup(markup, "lxml")
     expected = [
         Asset(resource='audio.wav', kind='audio-src', initiator=self.test_url),
     ]
     self.assertEqual(expected, extract_assets(document, self.test_url))
Example #6
0
 def test_should_extract_urls_in_inline_css(self):
     # A url(...) inside an element's `style` attribute is reported as a
     # 'style-resource-inline' asset initiated by the page itself.
     markup = """<html>
     <body style="background-image: url('https://example.org/files/example.png')"></body></html>"""
     document = BeautifulSoup(markup, "lxml")
     expected = [
         Asset(
             resource='https://example.org/files/example.png',
             kind='style-resource-inline',
             initiator=self.test_url,
         ),
     ]
     self.assertEqual(expected, extract_assets(document, self.test_url))
Example #7
0
    def test_should_extract_images(self):
        # The `src` of an <img> element is reported as an 'img-src' asset.
        markup = """
        <html><body><img src="image.jpg"></body></html>
        """
        document = BeautifulSoup(markup, "lxml")
        expected = [
            Asset(resource='image.jpg', kind='img-src', initiator=self.test_url),
        ]
        self.assertEqual(expected, extract_assets(document, self.test_url))
Example #8
0
 def test_should_extract_embedded_scripts_with_urls(self):
     # A URL appearing in the body of an inline <script> is reported as a
     # 'script-embed' asset initiated by the page itself.
     markup = """
     <html><head><script>var url = 'http://www.example.org';</script></head><body></body></html>
     """
     document = BeautifulSoup(markup, "lxml")
     found = extract_assets(document, self.test_url)
     expected = [
         Asset(
             resource='http://www.example.org',
             kind='script-embed',
             initiator=self.test_url,
         ),
     ]
     self.assertEqual(found, expected)
Example #9
0
    def test_should_extract_external_scripts(self, mock_requests):
        # The stubbed `get` hands back an empty body, so the only asset
        # expected is the <script src> reference itself ('script-src').
        empty_response = mock.Mock(text='')
        mock_requests.get.return_value = empty_response
        markup = """
        <html><head><script src="script.js"></head><body></body></html>
        """
        document = BeautifulSoup(markup, "lxml")
        expected = [
            Asset(resource='script.js', kind='script-src', initiator=self.test_url),
        ]
        self.assertEqual(expected, extract_assets(document, self.test_url))
Example #10
0
    def test_should_extract_urls_from_iframes(self):
        # The `src` of an <iframe> is reported as an 'iframe-src' asset.
        markup = """
        <html><body><iframe src="https://www.example.org/embed.html"></iframe></body></html>
        """
        document = BeautifulSoup(markup, "lxml")
        expected = [
            Asset(
                resource='https://www.example.org/embed.html',
                kind='iframe-src',
                initiator=self.test_url,
            ),
        ]
        self.assertEqual(expected, extract_assets(document, self.test_url))
Example #11
0
    def test_should_extract_links_to_stylesheets(self, mock_requests):
        # The `href` of a <link rel="stylesheet"> is reported as a
        # 'style-href' asset. `mock_requests` is accepted but left
        # unconfigured by this test.
        markup = """
        <html><head><link href="/media/example.css" rel="stylesheet"></head><body></body></html>
        """
        document = BeautifulSoup(markup, "lxml")
        expected = [
            Asset(
                resource='/media/example.css',
                kind='style-href',
                initiator=self.test_url,
            ),
        ]
        self.assertEqual(expected, extract_assets(document, self.test_url))
Example #12
0
    def test_should_extract_urls_in_embedded_css(self):
        # A url(...) inside a <style> element is reported as a 'style-embed'
        # asset initiated by the page itself.
        markup = """<html><head><style>
        div {
          background-image: url("https://example.org/files/example.png");
        }
        </style></head><body></body></html>"""
        document = BeautifulSoup(markup, "lxml")
        expected = [
            Asset(
                resource='https://example.org/files/example.png',
                kind='style-embed',
                initiator=self.test_url,
            ),
        ]
        self.assertEqual(expected, extract_assets(document, self.test_url))
Example #13
0
 def test_should_extract_urls_from_sources(self):
     # Every <source> nested in the <video> yields one 'source-src' asset,
     # listed in document order.
     markup = """<html><body>
     <video>
     <source src="video.webm" type="video/webm">
     <source src="video.ogg" type="video/ogg">
     <source src="video.mov" type="video/quicktime">
     </video></body></html>
     """
     document = BeautifulSoup(markup, "lxml")
     expected = [
         Asset(resource=name, kind='source-src', initiator=self.test_url)
         for name in ('video.webm', 'video.ogg', 'video.mov')
     ]
     self.assertEqual(expected, extract_assets(document, self.test_url))
Example #14
0
    def test_should_extract_urls_in_external_js(self, mock_requests):
        # Checks that URLs inside an externally referenced script are reported.
        # `mock_requests` is presumably injected by a patch decorator outside
        # this view; its `get` is stubbed to return JavaScript with one URL.
        mock_requests.get.return_value = mock.Mock(
            text=
            """function makeRequest() { $.getJSON('http://example.org/', function(data) {}); }"""
        )

        # The fixture previously read `src="file.js""`; the stray second
        # quote made the attribute malformed, so the test depended on the
        # parser's error recovery rather than on well-formed input.
        html = """
        <html><head><script src="file.js"></head><body></body></html>
        """
        soup = BeautifulSoup(html, "lxml")
        self.assertEqual(
            [
                # The script reference itself, initiated by the page.
                Asset(resource='file.js',
                      kind='script-src',
                      initiator=self.test_url),
                # The URL found in the script body, initiated by the script.
                Asset(
                    resource='http://example.org/',
                    kind='script-resource',
                    initiator='file.js',
                ),
            ],
            extract_assets(soup, self.test_url),
        )
Example #15
0
 def test_should_extract_urls_in_linked_css(self, requests_mock):
     # The stubbed `get` returns CSS containing one url(...). Both the
     # stylesheet link and that nested URL are expected; the comparison is
     # done on sets, so ordering is not asserted.
     stylesheet_body = 'selector { background-image: url("https://example.org/example.png") }'
     requests_mock.get.return_value = mock.Mock(text=stylesheet_body)
     markup = """
     <html><head><link href="https://example.org/styles.css" rel="stylesheet"></head><body></body></html>"""
     document = BeautifulSoup(markup, "lxml")
     found = set(extract_assets(document, self.test_url))
     expected = {
         Asset(
             resource='https://example.org/styles.css',
             kind='style-href',
             initiator=self.test_url,
         ),
         Asset(
             resource='https://example.org/example.png',
             kind='style-resource',
             initiator='https://example.org/styles.css',
         ),
     }
     self.assertEqual(found, expected)