@classmethod
def getDriveFileUrls(cls, url):
    ctnt, handle = cls.wg.getpage(url, returnMultiple=True)

    # Pull out the title for the disambiguation page.
    soup = WebRequest.as_soup(ctnt)
    title = soup.title.string

    # Google drive supports a `read?{google doc path}` mode. As such, we look at the actual URL,
    # which tells us if we redirected to a plain google doc, and just return that if the redirect occurred.
    handleUrl = handle.geturl()
    if handleUrl != url:
        if urlFuncs.isGdocUrl(handleUrl):
            cls.log.info("Direct read redirect: '%s'", handleUrl)
            handleUrl = urlFuncs.trimGDocUrl(handleUrl)
            return [(title, handleUrl)], title

    jsRe = re.compile(
        r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)',
        re.DOTALL)

    items = jsRe.findall(ctnt)
    assert len(items) == 1

    data = '{cont}'.format(cont=items.pop().strip())
    conf = jsLiteralParse.jsParse(data)

    # The keys+data in the data/conf are:
    # 'folderName'  - Title of the folder, just a string.
    # 'viewerItems' - List of lists of the items in the folder, containing the title, preview
    #                 image, and URL for each item. There is other stuff (mime types) for the
    #                 files, but they're all google-internal mime-types and look to be the same
    #                 for every file, even if they're different doc types.
    # 'folderModel' - List of UID and the view URL. Looks to be completely redundant, as all the
    #                 information is also in 'viewerItems'.
    assert 'viewerItems' in conf
    assert 'folderName' in conf

    title = conf['folderName']
    pages = conf['viewerItems']

    items = []
    for page in pages:
        if len(page) != 18 and len(page) != 22:
            cls.log.error("json entry in page with an invalid length:")
            cls.log.error("%s", page)
            continue

        # Item 2 is the title, item 17 is the doc URL.
        # The doc URL is unicode escaped, annoyingly.
        itemTitle = page[2]
        itemUrl = page[17].encode('ascii').decode('unicode_escape')
        itemUrl = urlFuncs.trimGDocUrl(itemUrl)
        items.append((itemTitle, itemUrl))

    return items, title
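# A minimal, self-contained sketch (not part of the original codebase) of two details used
# in getDriveFileUrls() above: locating the `var data = ...` blob with the same regex, and
# un-escaping a doc URL the way page[17] is handled. The HTML snippet and URL below are
# hypothetical examples; a real blob still needs jsLiteralParse.jsParse(), since it is a
# JS literal rather than valid JSON.
import re

_SAMPLE_HTML = (
    "<script>var data = {'folderName': 'Example Folder', 'viewerItems': []};"
    " _initFolderLandingPageApplication(config, data)</script>"
)

_JS_DATA_RE = re.compile(
    r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)',
    re.DOTALL)

if __name__ == '__main__':
    blobs = _JS_DATA_RE.findall(_SAMPLE_HTML)
    print(blobs)   # -> ["{'folderName': 'Example Folder', 'viewerItems': []}"]

    # The per-item doc URL arrives with \uXXXX escapes embedded in the string,
    # hence the encode('ascii').decode('unicode_escape') round trip.
    escaped = 'https://docs.google.com/document/d/EXAMPLE\\u003d/edit'
    print(escaped.encode('ascii').decode('unicode_escape'))
    # -> https://docs.google.com/document/d/EXAMPLE=/edit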
def extractGoogleDriveFolder(self, driveUrl):
    '''
    Extract all the relevant links from a google drive directory, and push them into
    the queued URL queue.
    '''

    newLinks = []
    self.log.info("Fetching drive container page")

    docReferences, pgTitle = gdp.GDocExtractor.getDriveFileUrls(driveUrl)
    # print('docReferences', docReferences)

    for dummy_title, url in docReferences:
        url = urlFuncs.trimGDocUrl(url)
        if url not in newLinks:
            newLinks.append(url)

    self.log.info("Generating google drive disambiguation page!")
    soup = gdp.makeDriveDisambiguation(docReferences, pgTitle)
    # print(disamb)

    soup = self.relink(soup)
    disamb = soup.prettify()

    ret = {}
    ret['contents'] = disamb
    ret['title'] = pgTitle
    ret['plainLinks'] = newLinks
    ret['rsrcLinks'] = []  # drive folders don't have resources

    self.log.info("Found %s items in google drive directory", len(docReferences))

    return ret
def relink(self, soup, imRelink=None):
    # The google doc reader relinking mechanism requires overriding the
    # image relinking mechanism. As such, allow that to be overridden
    # if needed.
    # print("relink call!")
    # print(self._relinkDomains)
    if not imRelink:
        imRelink = self.convertToReaderImage

    for (isImg, tag, attr) in urlFuncs.urlContainingTargets:

        if not isImg:
            for link in soup.findAll(tag):
                try:
                    # print("Link!", self.checkRelinkDomain(link[attr]), link[attr])
                    # if self.checkRelinkDomain(link[attr]):
                    link[attr] = self.convertToReaderUrl(link[attr])
                    if "google.com" in urllib.parse.urlsplit(link[attr].lower()).netloc:
                        link[attr] = urlFuncs.trimGDocUrl(link[attr])
                        # print("Relinked", link[attr])
                except TypeError:
                    # Empty href tags, not sure how this happens.
                    continue
                except KeyError:
                    continue

        else:
            for link in soup.findAll(tag):
                try:
                    link[attr] = imRelink(link[attr])

                    if tag == 'img':
                        # Force images that are oversize to fit the window.
                        link["style"] = 'max-width: 95%;'

                        if 'width' in link.attrs:
                            del link.attrs['width']
                        if 'height' in link.attrs:
                            del link.attrs['height']

                except TypeError:
                    continue
                except KeyError:
                    continue

    # Keyhole patch for fictionpress next/prev button onclick elements.
    for button in [item for item in soup.findAll('button') if item.has_attr("onclick")]:
        if button['onclick'].startswith("self.location='") \
                and button['onclick'].endswith("'") \
                and button['onclick'].count("'") == 2:

            prefix, url, postfix = button['onclick'].split("'")
            url = urlFuncs.rebaseUrl(url, self.pageUrl)
            url = self.convertToReaderUrl(url)
            button['onclick'] = "'".join((prefix, url, postfix))

    return soup
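# Minimal standalone sketch (not from the original codebase) of the onclick keyhole patch
# above: an onclick of the form self.location='<url>' is split on its single quotes, the
# URL rewritten, and the three pieces rejoined. The rewrite function and sample values
# here are hypothetical stand-ins for convertToReaderUrl and a real fictionpress button.
def _rewrite_onclick(onclick, rewrite):
    if onclick.startswith("self.location='") \
            and onclick.endswith("'") \
            and onclick.count("'") == 2:
        prefix, url, postfix = onclick.split("'")
        return "'".join((prefix, rewrite(url), postfix))
    return onclick

if __name__ == '__main__':
    sample = "self.location='/s/12345/2/'"
    print(_rewrite_onclick(sample, lambda u: 'https://reader.example/' + u.lstrip('/')))
    # -> self.location='https://reader.example/s/12345/2/'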
def processLinkItem(self, url, baseUrl):
    url = urlFuncs.cleanUrl(url)
    if not url:
        return None

    # F*****g tumblr redirects.
    if url.startswith("https://www.tumblr.com/login"):
        return None

    for badword in self._badwords:
        if badword in url:
            return

    url = urlFuncs.urlClean(url)
    if not url:
        return None

    if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
        url = urlFuncs.trimGDocUrl(url)

        if url.startswith('https://docs.google.com/document/d/images'):
            return

        # self.log.info("Resolved URL = '%s'", url)
        ret = self.processNewUrl(url, baseUrl)
        return ret
        # self.log.info("New G link: '%s'", url)

    else:
        # Remove any URL fragments causing multiple retrieval of the same resource.
        if url != urlFuncs.trimGDocUrl(url):
            print('Old URL: "%s"' % url)
            print('Trimmed: "%s"' % urlFuncs.trimGDocUrl(url))
            raise ValueError("Wat? Url change? Url: '%s'" % url)

        ret = self.processNewUrl(url, baseUrl)
        # print("Returning:", ret)
        return ret
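# Standalone sketch (not part of the original code) of the host check used above:
# matching "google.com" against the parsed netloc rather than the whole URL avoids
# false positives where the string only appears in a path or query parameter.
# The URLs below are hypothetical examples.
import urllib.parse

def _is_google_host(url):
    return "google.com" in urllib.parse.urlsplit(url.lower()).netloc

if __name__ == '__main__':
    print(_is_google_host('https://docs.google.com/document/d/abc123/edit'))   # True
    print(_is_google_host('https://example.com/redirect?to=google.com'))       # False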
def __init__(self, targetUrl):
    isGdoc, url = urlFuncs.isGdocUrl(targetUrl)
    if not isGdoc:
        raise ValueError("Passed URL '%s' is not a google document?" % targetUrl)

    url = urlFuncs.trimGDocUrl(url)

    self.url = url + '/export?format=zip'
    self.refererUrl = targetUrl

    self.document = ''
    self.currentChunk = ''
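# Illustrative sketch (not part of the original codebase): the shape of the export URL
# built in __init__ above, which fetches the document via Google's `/export?format=zip`
# endpoint. The document ID is a hypothetical example, and the exact trimming behaviour
# of urlFuncs.trimGDocUrl is assumed rather than reproduced.
if __name__ == '__main__':
    trimmed = 'https://docs.google.com/document/d/EXAMPLE_DOC_ID'
    export_url = trimmed + '/export?format=zip'
    print(export_url)
    # -> https://docs.google.com/document/d/EXAMPLE_DOC_ID/export?format=zip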
def processGdocPage(self, url, content):
    dummy_fName, content = content
    soup = WebRequest.as_soup(content)
    urlFuncs.canonizeUrls(soup, url)

    pgTitle, soup = self.cleanGdocPage(soup, url)

    plainLinks = self.extractLinks(soup, url)
    self.log.info("Page title = '%s'", pgTitle)
    soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

    url = self.preprocessGdocReaderUrl(url)
    url = urlFuncs.trimGDocUrl(url)

    # Since the content we're extracting will be embedded into another page, we want to
    # strip out the <body> and <html> tags. `unwrap()` replaces the soup with the contents
    # of the tag it's called on. We end up with just the contents of the <body> tag.
    soup.body.unwrap()
    pgBody = soup.prettify()

    # No image links, since they're served as resource files in a google doc.
    imageLinks = []

    return plainLinks, imageLinks, pgTitle, pgBody
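# Short standalone sketch (not part of the original code) of the unwrap() step above:
# bs4's Tag.unwrap() replaces a tag with its children, so stripping <body> leaves just
# the inner markup for embedding into another page. bs4 is assumed available, as it is
# used elsewhere in this codebase; the markup below is a hypothetical example.
import bs4

if __name__ == '__main__':
    soup = bs4.BeautifulSoup('<body><p>Hello</p></body>', 'html.parser')
    soup.body.unwrap()
    print(soup)   # -> <p>Hello</p>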