def processNewUrl(self, url, baseUrl=None, istext=True):
    if not url.lower().startswith("http"):
        if baseUrl:
            # If we have a base-url to extract the scheme from, we pull that out,
            # concatenate it onto the rest of the url segments, and then unsplit
            # that back into a full URL.
            scheme = urllib.parse.urlsplit(baseUrl.lower()).scheme
            rest = urllib.parse.urlsplit(url.lower())[1:]
            params = (scheme, ) + rest

            # self.log.info("Had to add scheme (%s) to URL: '%s'", scheme, url)
            url = urllib.parse.urlunsplit(params)

        elif self.ignoreBadLinks:
            self.log.error("Skipping a malformed URL!")
            self.log.error("Bad URL: '%s'", url)
            return
        else:
            raise ValueError("Url isn't a url: '%s'" % url)

    if gdp.isGdocUrl(url) or gdp.isGFileUrl(url):
        if gdp.trimGDocUrl(url) != url:
            raise ValueError("Invalid link crept through! Link: '%s'" % url)

    if not url.lower().startswith('http'):
        raise ValueError("Failure adding scheme to URL: '%s'" % url)

    if not self.checkDomain(url) and istext:
        raise ValueError("Invalid url somehow got through: '%s'" % url)

    if '/view/export?format=zip' in url:
        raise ValueError("Unexpected zip-export URL slipped through filtering: '%s'" % url)

    return url
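For illustration, the scheme-completion step above can be exercised on its own with just the standard library; the helper name below is illustrative and not part of the module:

import urllib.parse

def add_scheme_from_base(url, baseUrl):
    # Borrow the scheme from the page the link was found on, then
    # reassemble it with the link's own netloc/path/query/fragment.
    scheme = urllib.parse.urlsplit(baseUrl.lower()).scheme
    rest = urllib.parse.urlsplit(url.lower())[1:]
    return urllib.parse.urlunsplit((scheme, ) + rest)

# Protocol-relative links pick up the base page's scheme:
print(add_scheme_from_base("//docs.google.com/document/d/abc123", "https://example.com/page"))
# -> 'https://docs.google.com/document/d/abc123'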
def extractGoogleDriveFolder(self, driveUrl):
    '''
    Extract all the relevant links from a google drive directory, and push
    them into the URL queue.
    '''
    newLinks = []
    self.log.info("Fetching drive container page")

    docReferences, pgTitle = gdp.GDocExtractor.getDriveFileUrls(driveUrl)
    # print('docReferences', docReferences)
    for dummy_title, url in docReferences:
        url = gdp.trimGDocUrl(url)
        if url not in newLinks:
            newLinks.append(url)

    self.log.info("Generating google drive disambiguation page!")
    soup = gdp.makeDriveDisambiguation(docReferences, pgTitle)
    # print(disamb)

    soup = self.relink(soup)
    disamb = soup.prettify()

    ret = {}
    ret['contents'] = disamb
    ret['title'] = pgTitle
    ret['plainLinks'] = newLinks
    ret['rsrcLinks'] = []  # drive folders don't have resources

    self.log.info("Found %s items in google drive directory", len(docReferences))

    return ret
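A caller would presumably consume the returned mapping along these lines; the scraper instance, folder URL, and the storage/queueing helpers here are hypothetical stand-ins, not functions from this codebase:

ret = scraper.extractGoogleDriveFolder("https://drive.google.com/drive/folders/some-folder-id")
store_page(title=ret['title'], html=ret['contents'])  # hypothetical storage helper
for link in ret['plainLinks']:
    queue_url(link)                                    # hypothetical queueing helper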
def processLinkItem(self, url, baseUrl):
    url = gdp.clearOutboundProxy(url)
    url = gdp.clearBitLy(url)

    # Filter by domain
    if not self.checkDomain(url):
        # print("Filtering", self.checkDomain(url), url)
        return

    # and by blocked words
    for badword in self._badwords:
        if badword in url:
            # print("hadbad", self.checkDomain(url), url)
            return

    if not self.checkFollowGoogleUrl(url):
        return

    url = urlFuncs.urlClean(url)

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = gdp.trimGDocUrl(url)

        if url.startswith('https://docs.google.com/document/d/images'):
            return

        # self.log.info("Resolved URL = '%s'", url)
        ret = self.processNewUrl(url, baseUrl)
        return ret
        # self.log.info("New G link: '%s'", url)

    else:
        # Remove any URL fragments causing multiple retrieval of the same resource.
        if url != gdp.trimGDocUrl(url):
            print('Old URL: "%s"' % url)
            print('Trimmed: "%s"' % gdp.trimGDocUrl(url))
            raise ValueError("Wat? Url change? Url: '%s'" % url)
        ret = self.processNewUrl(url, baseUrl)
        # print("Returning:", ret)
        return ret
def processLinkItem(self, url, baseUrl):
    url = gdp.clearOutboundProxy(url)
    url = gdp.clearBitLy(url)

    # Filter by domain
    if not self.checkDomain(url):
        # print("Filtering", self.checkDomain(url), url)
        return

    # and by blocked words
    for badword in self._badwords:
        if badword in url:
            # print("hadbad", self.checkDomain(url), url)
            return

    if not self.checkFollowGoogleUrl(url):
        return

    url = TextScrape.urlFuncs.urlClean(url)

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = gdp.trimGDocUrl(url)

        if url.startswith('https://docs.google.com/document/d/images'):
            return

        # self.log.info("Resolved URL = '%s'", url)
        return self.processNewUrl(url, baseUrl)
        # self.log.info("New G link: '%s'", url)

    else:
        # Remove any URL fragments causing multiple retrieval of the same resource.
        if url != gdp.trimGDocUrl(url):
            print('Old URL: "%s"' % url)
            print('Trimmed: "%s"' % gdp.trimGDocUrl(url))
            raise ValueError("Wat? Url change? Url: '%s'" % url)
        return self.processNewUrl(url, baseUrl)
def urlClean(url):
    # Google docs can be accessed with or without the '/preview' postfix.
    # We want to remove this if it's present, so we don't duplicate content.
    url = gdp.trimGDocUrl(url)

    # Iteratively unquote and strip fragments until the URL stops changing.
    while True:
        url2 = urllib.parse.unquote(url)
        url2 = url2.split("#")[0]
        if url2 == url:
            break
        url = url2

    # Clean off whitespace.
    url = url.strip()

    return url
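The unquote-and-strip loop runs to a fixed point so that fragments hidden behind double percent-encoding are still removed; here is a standalone sketch of just that standard-library portion (gdp.trimGDocUrl is deliberately omitted, and the sample URL is made up):

import urllib.parse

def strip_fragment_and_unquote(url):
    # Repeat until nothing changes, so a fragment that only appears
    # after unquoting is also stripped.
    while True:
        url2 = urllib.parse.unquote(url)
        url2 = url2.split("#")[0]
        if url2 == url:
            break
        url = url2
    return url.strip()

print(strip_fragment_and_unquote("https://example.com/doc%2523heading-1"))
# -> 'https://example.com/doc'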
def relink(self, soup, imRelink=None):
    # The google doc reader relinking mechanism requires overriding the
    # image relinking mechanism. As such, allow that to be overridden
    # if needed.
    # print("relink call!")
    # print(self._relinkDomains)
    if not imRelink:
        imRelink = self.convertToReaderImage

    for (isImg, tag, attr) in urlFuncs.urlContainingTargets:

        if not isImg:
            for link in soup.findAll(tag):
                try:
                    # print("Link!", self.checkRelinkDomain(link[attr]), link[attr])
                    if self.checkRelinkDomain(link[attr]):
                        link[attr] = self.convertToReaderUrl(link[attr])
                    if "google.com" in urllib.parse.urlsplit(link[attr].lower()).netloc:
                        link[attr] = gdp.trimGDocUrl(link[attr])
                        # print("Relinked", link[attr])
                except KeyError:
                    continue

        else:
            for link in soup.findAll(tag):
                try:
                    link[attr] = imRelink(link[attr])

                    if tag == 'img':
                        # Force images that are oversize to fit the window.
                        link["style"] = 'max-width: 95%;'
                        if 'width' in link.attrs:
                            del link.attrs['width']
                        if 'height' in link.attrs:
                            del link.attrs['height']
                except KeyError:
                    continue

    return soup
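relink() drives off urlFuncs.urlContainingTargets; judging from how the tuples are unpacked, that table is a list of (isImg, tag, attr) triples roughly like the following. The exact entries here are an assumption for illustration, not copied from urlFuncs:

# Assumed shape of urlFuncs.urlContainingTargets: which tag attributes carry URLs,
# and whether the tag is an image (and so goes through imRelink instead).
urlContainingTargets = [
    (False, 'a',   'href'),
    (True,  'img', 'src'),
]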
def processGdocPage(self, url, content):
    dummy_fName, content = content
    print("Page size: ", len(content))
    soup = bs4.BeautifulSoup(content)

    TextScrape.urlFuncs.canonizeUrls(soup, url)

    pgTitle, soup = self.cleanGdocPage(soup, url)

    plainLinks = self.extractLinks(soup, url)
    self.log.info("Page title = '%s'", pgTitle)

    soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

    url = self.preprocessGdocReaderUrl(url)
    url = gdp.trimGDocUrl(url)

    # Since the content we're extracting will be embedded into another page, we want to
    # strip out the <body> and <html> tags. `unwrap()` replaces the soup with the contents
    # of the tag it's called on. We end up with just the contents of the <body> tag.
    soup.body.unwrap()
    pgBody = soup.prettify()

    # No image links, since they're served as resource files in a google doc.
    imageLinks = []

    return plainLinks, imageLinks, pgTitle, pgBody
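A self-contained sketch of the unwrap() behaviour relied on above, using only bs4; the sample markup is made up, and <html> is unwrapped here as well just to show a fully bare fragment (in the method above, the rest of the document has presumably already been handled by cleanGdocPage):

import bs4

html = "<html><body><p>Hello</p><p>World</p></body></html>"
soup = bs4.BeautifulSoup(html, "html.parser")
soup.body.unwrap()   # drop the <body> tag, keep its children in place
soup.html.unwrap()   # likewise for <html>
print(soup.prettify())
# Only the two <p> tags remain, ready to embed in another page.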