def convertToReaderUrl(self, inUrl, resource=False):
    """Rewrite a page link into a relinked reader URL.

    The URL is cleaned and preprocessed, protocol-relative links get a
    scheme, and the result is prefixed with the (lowercased) relink
    secret marker — RESOURCE for images/assets, CONTENT for documents.
    """
    inUrl = urlFuncs.urlClean(inUrl)
    inUrl = self.preprocessReaderUrl(inUrl)
    # The link will have been canonized at this point.

    # Leave inline data URIs and javascript no-op links untouched.
    if inUrl.startswith(("data:", "javascript:void(0);")):
        return inUrl

    # Protocol-relative URL: borrow the scheme from the page we came from.
    if inUrl.startswith("//"):
        if hasattr(self, "pageUrl"):
            scheme = urllib.parse.urlsplit(self.pageUrl).scheme
        else:
            self.log.warning("No pageUrl member variable? Guessing about the protocol type!")
            scheme = "http"
        inUrl = "{}:{}".format(scheme, inUrl)

    kind = "RESOURCE" if resource else "CONTENT"
    prefix = "{}:{}".format(kind, config.relink_secret)
    return '%s%s' % (prefix.lower(), urllib.parse.quote(inUrl))
def fetch(self):
    """Clean the target URL, retrieve it, and dispatch the payload."""
    self.target_url = url_util.urlClean(self.target_url)
    fetched = self.getItem(self.target_url)
    content, fName, mimeType = fetched
    return self.dispatchContent(content, fName, mimeType)
def processLinkItem(self, url, baseUrl):
    """Normalize a harvested link and hand it to processNewUrl.

    Links containing any blacklisted word are dropped (returns None).
    Google-hosted links get the google-docs trimming treatment; other
    links are sanity-checked to ensure trimming would be a no-op.
    """
    url = urlFuncs.clearOutboundProxy(url)
    url = urlFuncs.clearBitLy(url)

    # Drop anything containing a blacklisted substring.
    if any(badword in url for badword in self._badwords):
        return

    url = urlFuncs.urlClean(url)

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)
        # Google Docs inline-image URLs are not fetchable documents.
        if url.startswith('https://docs.google.com/document/d/images'):
            return
        return self.processNewUrl(url, baseUrl)

    # Non-google links should already be fragment-free; if trimming
    # would change the URL, the upstream cleaning missed something.
    trimmed = urlFuncs.trimGDocUrl(url)
    if url != trimmed:
        print('Old URL: "%s"' % url)
        print('Trimmed: "%s"' % trimmed)
        raise ValueError("Wat? Url change? Url: '%s'" % url)
    return self.processNewUrl(url, baseUrl)
def fetch(self, preretrieved):
    """Dispatch content: use *preretrieved* if supplied, else fetch target_url.

    *preretrieved* is a (content, fName, mimeType) triple; a falsy value
    triggers a fresh retrieval of self.target_url.
    """
    if preretrieved:
        content, fName, mimeType = preretrieved
    else:
        self.target_url = url_util.urlClean(self.target_url)
        content, fName, mimeType = self.getItem(self.target_url)
    return self.dispatchContent(content, fName, mimeType)
def processLinkItem(self, url, baseUrl):
    """Clean a scraped link and pass it on to processNewUrl.

    Returns whatever processNewUrl returns, or None when the link is
    filtered out: empty after cleaning, a tumblr login redirect, a
    blacklisted word, or a google-docs inline-image URL.

    Raises ValueError if a non-google URL would be altered by
    trimGDocUrl — that indicates upstream cleaning missed a fragment.
    """
    url = urlFuncs.cleanUrl(url)
    if not url:
        return None

    # F*****g tumblr redirects.
    if url.startswith("https://www.tumblr.com/login"):
        return None

    # Fix: this badword filter loop was duplicated verbatim in the
    # original — a single pass is sufficient and behaviorally identical.
    for badword in self._badwords:
        if badword in url:
            return None

    url = urlFuncs.urlClean(url)

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)
        # Google Docs inline-image URLs are not retrievable documents.
        if url.startswith('https://docs.google.com/document/d/images'):
            return None
        return self.processNewUrl(url, baseUrl)

    # Non-google URLs should already be fragment-free; if trimming would
    # change the URL, something upstream went wrong.
    trimmed = urlFuncs.trimGDocUrl(url)
    if url != trimmed:
        print('Old URL: "%s"' % url)
        print('Trimmed: "%s"' % trimmed)
        raise ValueError("Wat? Url change? Url: '%s'" % url)
    return self.processNewUrl(url, baseUrl)
def processImageLink(self, url, baseUrl):
    """Queue an image URL for retrieval via processNewUrl (istext=False).

    Tags with an empty ``img src=""`` show up in the wild for no obvious
    reason; they are silently skipped.
    """
    if not url:
        return

    # NOTE: domain- and badword-filtering for image links was previously
    # present here but has been disabled.
    url = urlFuncs.urlClean(url)
    return self.processNewUrl(url, baseUrl=baseUrl, istext=False)
def convertToReaderImage(self, inStr):
    """Relink an image URL: clean it, then convert it as a resource."""
    cleaned = urlFuncs.urlClean(inStr)
    return self.convertToReaderUrl(cleaned, resource=True)
def convertToReaderUrl(self, inUrl):
    """Map a link onto the local ``/books/render`` reader endpoint."""
    inUrl = urlFuncs.urlClean(inUrl)
    inUrl = self.preprocessReaderUrl(inUrl)
    # The link will have been canonized at this point.
    quoted = urllib.parse.quote(inUrl)
    return '/books/render?url=%s' % quoted