def convertToReaderUrl(self, inUrl, resource=False):
    """Rewrite a page link into a relinked reader URL.

    The URL is cleaned and preprocessed, protocol-relative links get a
    scheme, and the result is prefixed with the (lowercased) relink
    secret marker — RESOURCE for images/assets, CONTENT for documents.
    """
    inUrl = urlFuncs.urlClean(inUrl)
    inUrl = self.preprocessReaderUrl(inUrl)
    # The link will have been canonized at this point.

    # Leave inline data URIs and javascript no-op links untouched.
    if inUrl.startswith(("data:", "javascript:void(0);")):
        return inUrl

    # Protocol-relative URL: borrow the scheme from the page we came from.
    if inUrl.startswith("//"):
        if hasattr(self, "pageUrl"):
            scheme = urllib.parse.urlsplit(self.pageUrl).scheme
        else:
            self.log.warning("No pageUrl member variable? Guessing about the protocol type!")
            scheme = "http"
        inUrl = "{}:{}".format(scheme, inUrl)

    kind = "RESOURCE" if resource else "CONTENT"
    prefix = "{}:{}".format(kind, config.relink_secret)
    return '%s%s' % (prefix.lower(), urllib.parse.quote(inUrl))
def fetch(self):
    """Clean the target URL, retrieve it, and dispatch the payload."""
    self.target_url = url_util.urlClean(self.target_url)
    fetched = self.getItem(self.target_url)
    content, fName, mimeType = fetched
    return self.dispatchContent(content, fName, mimeType)
def processLinkItem(self, url, baseUrl):
    """Normalize a harvested link and hand it to processNewUrl.

    Links containing any blacklisted word are dropped (returns None).
    Google-hosted links get the google-docs trimming treatment; other
    links are sanity-checked to ensure trimming would be a no-op.
    """
    url = urlFuncs.clearOutboundProxy(url)
    url = urlFuncs.clearBitLy(url)

    # Drop anything containing a blacklisted substring.
    if any(badword in url for badword in self._badwords):
        return

    url = urlFuncs.urlClean(url)

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)
        # Google Docs inline-image URLs are not fetchable documents.
        if url.startswith('https://docs.google.com/document/d/images'):
            return
        return self.processNewUrl(url, baseUrl)

    # Non-google links should already be fragment-free; if trimming
    # would change the URL, the upstream cleaning missed something.
    trimmed = urlFuncs.trimGDocUrl(url)
    if url != trimmed:
        print('Old URL: "%s"' % url)
        print('Trimmed: "%s"' % trimmed)
        raise ValueError("Wat? Url change? Url: '%s'" % url)
    return self.processNewUrl(url, baseUrl)
def fetch(self, preretrieved):
    """Dispatch content: use *preretrieved* if supplied, else fetch target_url.

    *preretrieved* is a (content, fName, mimeType) triple; a falsy value
    triggers a fresh retrieval of self.target_url.
    """
    if preretrieved:
        content, fName, mimeType = preretrieved
    else:
        self.target_url = url_util.urlClean(self.target_url)
        content, fName, mimeType = self.getItem(self.target_url)
    return self.dispatchContent(content, fName, mimeType)
def processLinkItem(self, url, baseUrl):
    """Clean a scraped link and pass it on to processNewUrl.

    Returns whatever processNewUrl returns, or None when the link is
    filtered out: empty after cleaning, a tumblr login redirect, a
    blacklisted word, or a google-docs inline-image URL.

    Raises ValueError if a non-google URL would be altered by
    trimGDocUrl — that indicates upstream cleaning missed a fragment.
    """
    url = urlFuncs.cleanUrl(url)
    if not url:
        return None

    # F*****g tumblr redirects.
    if url.startswith("https://www.tumblr.com/login"):
        return None

    # Fix: this badword filter loop was duplicated verbatim in the
    # original — a single pass is sufficient and behaviorally identical.
    for badword in self._badwords:
        if badword in url:
            return None

    url = urlFuncs.urlClean(url)

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)
        # Google Docs inline-image URLs are not retrievable documents.
        if url.startswith('https://docs.google.com/document/d/images'):
            return None
        return self.processNewUrl(url, baseUrl)

    # Non-google URLs should already be fragment-free; if trimming would
    # change the URL, something upstream went wrong.
    trimmed = urlFuncs.trimGDocUrl(url)
    if url != trimmed:
        print('Old URL: "%s"' % url)
        print('Trimmed: "%s"' % trimmed)
        raise ValueError("Wat? Url change? Url: '%s'" % url)
    return self.processNewUrl(url, baseUrl)
def processImageLink(self, url, baseUrl):
    """Queue an image URL for retrieval via processNewUrl (istext=False).

    Tags with an empty ``img src=""`` show up in the wild for no obvious
    reason; they are silently skipped.
    """
    if not url:
        return

    # NOTE: domain- and badword-filtering for image links was previously
    # present here but has been disabled.
    url = urlFuncs.urlClean(url)
    return self.processNewUrl(url, baseUrl=baseUrl, istext=False)
def convertToReaderImage(self, inStr):
    """Relink an image URL: clean it, then convert it as a resource."""
    cleaned = urlFuncs.urlClean(inStr)
    return self.convertToReaderUrl(cleaned, resource=True)
def convertToReaderUrl(self, inUrl):
    """Map a link onto the local ``/books/render`` reader endpoint."""
    inUrl = urlFuncs.urlClean(inUrl)
    inUrl = self.preprocessReaderUrl(inUrl)
    # The link will have been canonized at this point.
    quoted = urllib.parse.quote(inUrl)
    return '/books/render?url=%s' % quoted