def processImageLink(self, url, baseUrl):
    """Validate an image URL and queue it for retrieval.

    Returns the result of processNewUrl(istext=False), or None when the
    URL is empty, contains a blocked word, or is rejected by urlClean().
    """
    # Skip tags with `img src=""`.
    # No idea why they're there, but they are.
    # (The original also had a redundant `if url is None` check right
    # after this one — `not url` already covers None.)
    if not url:
        return None

    # # Filter by domain
    # if not self.allImages and not any([base in url for base in self._fileDomains]):
    #     return

    # Drop URLs containing any blocked word (case-insensitive).
    lowered = url.lower()
    if any(badword.lower() in lowered for badword in self._badwords):
        return None

    url = urlFuncs.urlClean(url)
    # urlClean can return None for URLs pointing to garbage squatters
    # and some other contexts.
    if url is None:
        return None

    return self.processNewUrl(url, baseUrl=baseUrl, istext=False)
def convertToReaderUrl(self, inUrl, resource=False):
    """Rewrite a canonized URL into a relinked reader URL.

    Inline data URIs and javascript-NOP links pass through unchanged.
    Protocol-relative URLs borrow the scheme from self.pageUrl when it
    exists (falling back to "http" with a warning otherwise).
    """
    inUrl = urlFuncs.urlClean(inUrl)
    inUrl = self.preprocessReaderUrl(inUrl)

    # The link will have been canonized at this point.
    # Do not relink inline images, or links NOP()ed with javascript.
    if inUrl.startswith("data:") or inUrl.startswith("javascript:void(0);"):
        return inUrl

    # Fix protocol-relative URLs.
    if inUrl.startswith("//"):
        if hasattr(self, "pageUrl"):
            scheme = urllib.parse.urlsplit(self.pageUrl).scheme
        else:
            self.log.warning("No pageUrl member variable? Guessing about the protocol type!")
            scheme = "http"
        inUrl = "{}:{}".format(scheme, inUrl)

    marker = "RESOURCE" if resource else "CONTENT"
    prefix = "{}:{}".format(marker, config.relink_secret)
    return '%s%s' % (prefix.lower(), urllib.parse.quote(inUrl))
def fetch(self, preretrieved):
    """Dispatch content, fetching it first unless already retrieved.

    `preretrieved`, when truthy, is a (content, fName, mimeType) tuple;
    otherwise the target URL is cleaned and fetched via getItem().
    """
    if preretrieved:
        content, fName, mimeType = preretrieved
    else:
        self.target_url = url_util.urlClean(self.target_url)
        content, fName, mimeType = self.getItem(self.target_url)
    return self.dispatchContent(content, fName, mimeType)
def processLinkItem(self, url, baseUrl):
    """Clean and filter a candidate link, then hand it to processNewUrl().

    Returns None for empty, tumblr-login, blocked-word, or
    garbage-cleaned URLs; otherwise whatever processNewUrl() returns.
    Google Docs URLs are trimmed of volatile fragments first.
    """
    url = urlFuncs.cleanUrl(url)
    if not url:
        return None

    # F*****g tumblr redirects.
    if url.startswith("https://www.tumblr.com/login"):
        return None

    # Drop links containing any blocked word.
    # (The original ran this exact loop twice back-to-back; once suffices.)
    for badword in self._badwords:
        if badword in url:
            return None

    url = urlFuncs.urlClean(url)
    # urlClean can reject the URL entirely.
    if not url:
        return None

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)
        if url.startswith('https://docs.google.com/document/d/images'):
            return None
        return self.processNewUrl(url, baseUrl)
    else:
        # Remove any URL fragments causing multiple retreival of the same resource.
        if url != urlFuncs.trimGDocUrl(url):
            print('Old URL: "%s"' % url)
            print('Trimmed: "%s"' % urlFuncs.trimGDocUrl(url))
            raise ValueError("Wat? Url change? Url: '%s'" % url)
        return self.processNewUrl(url, baseUrl)
def fetch(self, preretrieved):
    """Dispatch content (fetching it first if needed), reporting latency.

    Dispatch wall-time in milliseconds is reported to mon_con under a
    key derived from the mime-type with separator characters dashed out.
    """
    if preretrieved:
        content, fName, mimeType = preretrieved
    else:
        self.target_url = url_util.urlClean(self.target_url)
        content, fName, mimeType = self.getItem(self.target_url)

    start = time.time()
    ret = self.dispatchContent(content, fName, mimeType)
    elapsed_ms = (time.time() - start) * 1000

    # Build the stats key: '/', '\', ':' and '.' all become '-'.
    key = mimeType
    for ch in ('/', '\\', ':', '.'):
        key = key.replace(ch, "-")
    self.mon_con.timing("{}".format(key), elapsed_ms)

    return ret
def processLinkItem(self, url, baseUrl):
    """Clean and filter a candidate link, then hand it to processNewUrl().

    Returns None for empty, tumblr-login, blocked-word, or
    garbage-cleaned URLs; otherwise whatever processNewUrl() returns.
    Google Docs URLs are trimmed of volatile fragments first.
    """
    url = urlFuncs.cleanUrl(url)
    if not url:
        return None

    # F*****g tumblr redirects.
    if url.startswith("https://www.tumblr.com/login"):
        return None

    # Drop links containing any blocked word.
    # (The original ran this exact loop twice back-to-back; once suffices.)
    for badword in self._badwords:
        if badword in url:
            return None

    url = urlFuncs.urlClean(url)
    # BUGFIX: urlClean can return None (the sibling implementation of
    # this method guards for it); without this check url.lower() below
    # would raise AttributeError.
    if not url:
        return None

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)
        if url.startswith('https://docs.google.com/document/d/images'):
            return None
        return self.processNewUrl(url, baseUrl)
    else:
        # Remove any URL fragments causing multiple retreival of the same resource.
        if url != urlFuncs.trimGDocUrl(url):
            print('Old URL: "%s"' % url)
            print('Trimmed: "%s"' % urlFuncs.trimGDocUrl(url))
            raise ValueError("Wat? Url change? Url: '%s'" % url)
        return self.processNewUrl(url, baseUrl)
def processImageLink(self, url, baseUrl):
    """Queue an image URL for retrieval via processNewUrl().

    Returns None for empty URLs or URLs rejected by urlClean().
    """
    # Skip tags with `img src=""`.
    # No idea why they're there, but they are.
    if not url:
        return

    # # Filter by domain
    # if not self.allImages and not any([base in url for base in self._fileDomains]):
    #     return
    # # and by blocked words
    # hadbad = False
    # for badword in self._badwords:
    #     if badword.lower() in url.lower():
    #         hadbad = True
    # if hadbad:
    #     return

    url = urlFuncs.urlClean(url)
    # BUGFIX: urlClean can return None for garbage/squatter URLs (the
    # sibling implementation of this method guards for it); bail out
    # rather than passing None to processNewUrl().
    if url is None:
        return None

    return self.processNewUrl(url, baseUrl=baseUrl, istext=False)
def convertToReaderImage(self, inStr):
    """Relink an image URL as a reader resource URL."""
    cleaned = urlFuncs.urlClean(inStr)
    return self.convertToReaderUrl(cleaned, resource=True)