Example 1
    def test_html_detect(self):
        # Content sniffing on raw bytes, including UTF-16LE-encoded markup.
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO('<html><body>hi</body></html>'.encode('utf-16le'))
        ))
        self.assertFalse(HTMLReader.is_file(
            io.BytesIO('hello world!'.encode('utf-16le'))
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(b'<title>hello</title>hi')
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(b'<html><body>hello')
        ))
        self.assertTrue(HTMLReader.is_file(
            io.BytesIO(
                b'The document has moved <a href="somewhere.html">here</a>'
            )
        ))
        # Detection by URL file extension.
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.htm'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.html'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.dhtm'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.xhtml'))
        )
        self.assertTrue(
            HTMLReader.is_url(URLInfo.parse('example.com/index.xht'))
        )
        self.assertFalse(
            HTMLReader.is_url(URLInfo.parse('example.com/image.jpg'))
        )
        # Detection from a request object.
        self.assertTrue(
            HTMLReader.is_request(Request('example.com/index.html'))
        )
        self.assertFalse(
            HTMLReader.is_request(Request('example.com/image.jpg'))
        )

        # Detection from the response Content-Type header.
        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'text/html'
        self.assertTrue(HTMLReader.is_response(response))

        response = Response(200, 'OK')
        response.fields['Content-Type'] = 'image/png'
        self.assertFalse(HTMLReader.is_response(response))
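
The UTF-16LE cases in this test are worth noting: UTF-16LE interleaves NUL bytes between the ASCII characters, so a naive byte-level search for `<html` would miss the tag. A quick standalone illustration of the input (plain Python, independent of wpull; it shows what the detector has to cope with, not how `HTMLReader.is_file()` works internally):

data = '<html><body>hi</body></html>'.encode('utf-16le')
print(data[:12])                            # b'<\x00h\x00t\x00m\x00l\x00>\x00'
assert b'<html' not in data                 # naive ASCII substring sniffing fails
assert '<html' in data.decode('utf-16le')   # the tag is only visible after decoding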
Example 2
    def scrape_document(self, item_session):
        response = item_session.response
        url_info = item_session.request.url_info
        url = url_info.raw

        # Only run dupe detection on bodies under 30 MiB.
        if response_body_size(response) < 30 * 1024 * 1024:
            dupes_db = self.dupes_db
            body = response.body.content()
            if HTMLReader.is_response(response):
                body = dupespotter.process_body(body, url)
            digest = hashlib.md5(body).digest()
            if dupes_db is not None:
                dupe_of = dupes_db.get_old_url(digest)
            else:
                dupe_of = None
            if dupe_of is not None:
                # Don't extract links from pages we've already seen
                # to avoid loops that descend a directory endlessly
                print("DUPE {}\n  OF {}".format(url, dupe_of))
                return
            else:
                if dupes_db is not None:
                    dupes_db.set_old_url(digest, url)

        super().scrape_document(item_session)
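
The duplicate check itself has no dependency on wpull: it maps an MD5 digest of the (optionally normalized) body to the first URL that produced it. A minimal self-contained sketch, with an in-memory dict standing in for ArchiveBot's dupes_db object (a hypothetical stand-in, not the real class):

import hashlib


class DictDupesDb:
    '''Hypothetical in-memory stand-in for the dupes_db used above.'''

    def __init__(self):
        self._digest_to_url = {}

    def get_old_url(self, digest):
        # Return the URL previously recorded for this digest, or None.
        return self._digest_to_url.get(digest)

    def set_old_url(self, digest, url):
        self._digest_to_url[digest] = url


def dupe_of(dupes_db, body, url):
    '''Return the earlier URL with an identical body, recording url otherwise.'''
    digest = hashlib.md5(body).digest()
    old_url = dupes_db.get_old_url(digest)
    if old_url is None:
        dupes_db.set_old_url(digest, url)
    return old_url


db = DictDupesDb()
assert dupe_of(db, b'<html>hi</html>', 'http://a.example/page1') is None
assert dupe_of(db, b'<html>hi</html>', 'http://a.example/page2') == 'http://a.example/page1'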
Example 3
    def _append_filename_extension(self, response):
        '''Append an HTML/CSS file suffix as needed.'''
        if not self._filename:
            return

        if response.request.url_info.scheme not in ('http', 'https'):
            return

        if not re.search(r'\.[hH][tT][mM][lL]?$', self._filename) and \
                HTMLReader.is_response(response):
            self._filename += '.html'
        elif not re.search(r'\.[cC][sS][sS]$', self._filename) and \
                CSSReader.is_response(response):
            self._filename += '.css'
Example 4
    def _append_filename_extension(self, response: BaseResponse):
        '''Append an HTML/CSS file suffix as needed.'''
        if not self._filename:
            return

        if response.request.url_info.scheme not in ('http', 'https'):
            return

        if not re.search(r'\.[hH][tT][mM][lL]?$', self._filename) and \
                HTMLReader.is_response(response):
            self._filename += '.html'
        elif not re.search(r'\.[cC][sS][sS]$', self._filename) and \
                CSSReader.is_response(response):
            self._filename += '.css'
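
Examples 3 and 4 are the same method before and after a type annotation was added; the suffix logic itself reduces to a pure function of the filename and the detected document type. A sketch under that reading, with is_html/is_css as hypothetical booleans standing in for the HTMLReader.is_response()/CSSReader.is_response() checks:

import re


def append_filename_extension(filename, is_html, is_css):
    '''Append .html or .css when the name lacks a matching suffix.'''
    if is_html and not re.search(r'\.html?$', filename, re.IGNORECASE):
        return filename + '.html'
    if is_css and not re.search(r'\.css$', filename, re.IGNORECASE):
        return filename + '.css'
    return filename


assert append_filename_extension('page', True, False) == 'page.html'
assert append_filename_extension('page.HTM', True, False) == 'page.HTM'
assert append_filename_extension('style', False, True) == 'style.css'

The re.IGNORECASE flag replaces the character classes ([hH][tT][mM][lL]?) in the originals; the match behavior is the same.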
Example 5
    def scrape_document(self, request, response, url_item):
        if response.body.size() < 30 * 1024 * 1024:
            dupes_db = self.dupes_db
            body = response.body.content()
            if HTMLReader.is_response(response):
                body = dupespotter.process_body(body, response.request.url)
            digest = hashlib.md5(body).digest()
            if dupes_db is not None:
                dupe_of = dupes_db.get_old_url(digest)
            else:
                dupe_of = None
            if dupe_of is not None:
                # Don't extract links from pages we've already seen
                # to avoid loops that descend a directory endlessly
                print("  DUPE {}\n      OF {}".format(response.request.url, dupe_of))
                return
            else:
                if dupes_db is not None:
                    dupes_db.set_old_url(digest, response.request.url)

        super().scrape_document(request, response, url_item)
Example 6
    def scrape_document(self, request, response, url_item):
        if response.body.size() < 30 * 1024 * 1024:
            dupes_db = self.dupes_db
            body = response.body.content()
            if HTMLReader.is_response(response):
                body = archivebot.dupespotter.dupespotter.process_body(body, response.request.url)

            digest = hashlib.md5(body).digest()
            if dupes_db is not None:
                dupe_of = dupes_db.get_old_url(digest)
            else:
                dupe_of = None
            if dupe_of is not None:
                # Don't extract links from pages we've already seen
                # to avoid loops that descend a directory endlessly
                print("  DUPE {}\n      OF {}".format(response.request.url, dupe_of))
                sys.stdout.flush()
                return
            else:
                if dupes_db is not None:
                    dupes_db.set_old_url(digest, response.request.url)

        super().scrape_document(request, response, url_item)
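
Examples 2, 5, and 6 implement the same duplicate check against two generations of the scraper hook: Example 2 receives a single item_session, while Examples 5 and 6 use the older (request, response, url_item) signature, with Example 6 additionally flushing stdout after printing.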