def processImageLink(self, url, baseUrl):
    """Validate an image URL and queue it for retrieval.

    Returns the result of processNewUrl(istext=False), or None when the
    URL is empty, contains a blocked word, or is rejected by urlClean().
    """
    # Skip tags with `img src=""`.
    # No idea why they're there, but they are.
    # (The original also had a redundant `if url is None` check right
    # after this one — `not url` already covers None.)
    if not url:
        return None

    # # Filter by domain
    # if not self.allImages and not any([base in url for base in self._fileDomains]):
    #     return

    # Drop URLs containing any blocked word (case-insensitive).
    lowered = url.lower()
    if any(badword.lower() in lowered for badword in self._badwords):
        return None

    url = urlFuncs.urlClean(url)
    # urlClean can return None for URLs pointing to garbage squatters
    # and some other contexts.
    if url is None:
        return None

    return self.processNewUrl(url, baseUrl=baseUrl, istext=False)
def convertToReaderUrl(self, inUrl, resource=False):
    """Rewrite a canonized URL into a relinked reader URL.

    Inline data URIs and javascript-NOP links pass through unchanged.
    Protocol-relative URLs borrow the scheme from self.pageUrl when it
    exists (falling back to "http" with a warning otherwise).
    """
    inUrl = urlFuncs.urlClean(inUrl)
    inUrl = self.preprocessReaderUrl(inUrl)

    # The link will have been canonized at this point.
    # Do not relink inline images, or links NOP()ed with javascript.
    if inUrl.startswith("data:") or inUrl.startswith("javascript:void(0);"):
        return inUrl

    # Fix protocol-relative URLs.
    if inUrl.startswith("//"):
        if hasattr(self, "pageUrl"):
            scheme = urllib.parse.urlsplit(self.pageUrl).scheme
        else:
            self.log.warning("No pageUrl member variable? Guessing about the protocol type!")
            scheme = "http"
        inUrl = "{}:{}".format(scheme, inUrl)

    marker = "RESOURCE" if resource else "CONTENT"
    prefix = "{}:{}".format(marker, config.relink_secret)
    return '%s%s' % (prefix.lower(), urllib.parse.quote(inUrl))
def fetch(self, preretrieved):
    """Dispatch content, fetching it first unless already retrieved.

    `preretrieved`, when truthy, is a (content, fName, mimeType) tuple;
    otherwise the target URL is cleaned and fetched via getItem().
    """
    if preretrieved:
        content, fName, mimeType = preretrieved
    else:
        self.target_url = url_util.urlClean(self.target_url)
        content, fName, mimeType = self.getItem(self.target_url)
    return self.dispatchContent(content, fName, mimeType)
def processLinkItem(self, url, baseUrl):
    """Clean and filter a candidate link, then hand it to processNewUrl().

    Returns None for empty, tumblr-login, blocked-word, or
    garbage-cleaned URLs; otherwise whatever processNewUrl() returns.
    Google Docs URLs are trimmed of volatile fragments first.
    """
    url = urlFuncs.cleanUrl(url)
    if not url:
        return None

    # F*****g tumblr redirects.
    if url.startswith("https://www.tumblr.com/login"):
        return None

    # Drop links containing any blocked word.
    # (The original ran this exact loop twice back-to-back; once suffices.)
    for badword in self._badwords:
        if badword in url:
            return None

    url = urlFuncs.urlClean(url)
    # urlClean can reject the URL entirely.
    if not url:
        return None

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)
        if url.startswith('https://docs.google.com/document/d/images'):
            return None
        return self.processNewUrl(url, baseUrl)
    else:
        # Remove any URL fragments causing multiple retreival of the same resource.
        if url != urlFuncs.trimGDocUrl(url):
            print('Old URL: "%s"' % url)
            print('Trimmed: "%s"' % urlFuncs.trimGDocUrl(url))
            raise ValueError("Wat? Url change? Url: '%s'" % url)
        return self.processNewUrl(url, baseUrl)
def fetch(self, preretrieved):
    """Dispatch content (fetching it first if needed), reporting latency.

    Dispatch wall-time in milliseconds is reported to mon_con under a
    key derived from the mime-type with separator characters dashed out.
    """
    if preretrieved:
        content, fName, mimeType = preretrieved
    else:
        self.target_url = url_util.urlClean(self.target_url)
        content, fName, mimeType = self.getItem(self.target_url)

    start = time.time()
    ret = self.dispatchContent(content, fName, mimeType)
    elapsed_ms = (time.time() - start) * 1000

    # Build the stats key: '/', '\', ':' and '.' all become '-'.
    key = mimeType
    for ch in ('/', '\\', ':', '.'):
        key = key.replace(ch, "-")
    self.mon_con.timing("{}".format(key), elapsed_ms)

    return ret
def processLinkItem(self, url, baseUrl):
    """Clean and filter a candidate link, then hand it to processNewUrl().

    Returns None for empty, tumblr-login, blocked-word, or
    garbage-cleaned URLs; otherwise whatever processNewUrl() returns.
    Google Docs URLs are trimmed of volatile fragments first.
    """
    url = urlFuncs.cleanUrl(url)
    if not url:
        return None

    # F*****g tumblr redirects.
    if url.startswith("https://www.tumblr.com/login"):
        return None

    # Drop links containing any blocked word.
    # (The original ran this exact loop twice back-to-back; once suffices.)
    for badword in self._badwords:
        if badword in url:
            return None

    url = urlFuncs.urlClean(url)
    # BUGFIX: urlClean can return None (the sibling implementation of
    # this method guards for it); without this check url.lower() below
    # would raise AttributeError.
    if not url:
        return None

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)
        if url.startswith('https://docs.google.com/document/d/images'):
            return None
        return self.processNewUrl(url, baseUrl)
    else:
        # Remove any URL fragments causing multiple retreival of the same resource.
        if url != urlFuncs.trimGDocUrl(url):
            print('Old URL: "%s"' % url)
            print('Trimmed: "%s"' % urlFuncs.trimGDocUrl(url))
            raise ValueError("Wat? Url change? Url: '%s'" % url)
        return self.processNewUrl(url, baseUrl)
def processImageLink(self, url, baseUrl):
    """Queue an image URL for retrieval via processNewUrl().

    Returns None for empty URLs or URLs rejected by urlClean().
    """
    # Skip tags with `img src=""`.
    # No idea why they're there, but they are.
    if not url:
        return

    # # Filter by domain
    # if not self.allImages and not any([base in url for base in self._fileDomains]):
    #     return
    # # and by blocked words
    # hadbad = False
    # for badword in self._badwords:
    #     if badword.lower() in url.lower():
    #         hadbad = True
    # if hadbad:
    #     return

    url = urlFuncs.urlClean(url)
    # BUGFIX: urlClean can return None for garbage/squatter URLs (the
    # sibling implementation of this method guards for it); bail out
    # rather than passing None to processNewUrl().
    if url is None:
        return None

    return self.processNewUrl(url, baseUrl=baseUrl, istext=False)
def convertToReaderImage(self, inStr):
    """Relink an image URL as a reader resource URL."""
    cleaned = urlFuncs.urlClean(inStr)
    return self.convertToReaderUrl(cleaned, resource=True)