def getDriveFileUrls(cls, url):
    """
    Fetch a google drive folder page and list the documents it links to.

    Args:
        url: Address of the drive folder page to fetch via `cls.wg.getpage()`.

    Returns:
        A 2-tuple `(items, title)`. `items` is a list of `(itemTitle, itemUrl)`
        tuples and `title` is a string. If the fetch was redirected to a plain
        google doc, `items` holds just that one document and `title` is the
        HTML page title. Otherwise `title` is the folder name taken from the
        data embedded in the page.

    Raises:
        AssertionError: If the embedded folder data cannot be located exactly
            once, or it lacks the 'viewerItems' / 'folderName' keys.
    """
    ctnt, handle = cls.wg.getpage(url, returnMultiple=True)

    # Pull out the title for the disambiguation page.
    soup = common.util.webFunctions.as_soup(ctnt)
    title = soup.title.string

    # Google drive supports a `read?{google doc path}` mode. As such, we look at the actual URL,
    # which tells us if we redirected to a plain google doc, and just return that if the redirect occurred.
    handleUrl = handle.geturl()
    if handleUrl != url:
        # isGdocUrl() hands back an (isGdoc, url) pair. A non-empty tuple is always
        # truthy, so the flag has to be checked explicitly.
        if urlFuncs.isGdocUrl(handleUrl)[0]:
            cls.log.info("Direct read redirect: '%s'", handleUrl)
            handleUrl = urlFuncs.trimGDocUrl(handleUrl)
            return [(title, handleUrl)], title

    # Raw string so the `\(` escapes reach the regex engine untouched.
    jsRe = re.compile(r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)', re.DOTALL)
    matches = jsRe.findall(ctnt)
    assert len(matches) == 1, "Expected exactly one folder data blob, found %s" % len(matches)

    conf = jsLiteralParse.jsParse(matches[0].strip())

    # The keys+data in the data/conf are:
    # 	'folderName' - Title of the folder, just a string
    # 	'viewerItems' - List of lists of the items in the folder, which contains the title, previewimage, and url for each item.
    # 	                Other stuff (mime types) for the files, but they're all google internal mime-types and look to be the same for
    # 	                every file, even if they're different docs types.
    # 	'folderModel' - List of UID and the view URL. Looks to be completely redundant, as all the information is also in 'viewerItems'
    assert 'viewerItems' in conf, "Folder data has no 'viewerItems' key"
    assert 'folderName' in conf, "Folder data has no 'folderName' key"

    title = conf['folderName']
    pages = conf['viewerItems']

    items = []
    for page in pages:
        # Only the two known entry layouts are handled; anything else is logged and skipped.
        if len(page) not in (18, 22):
            cls.log.error("json entry in page with an invalid length:")
            cls.log.error("%s", page)
            continue

        # Item 2 is the title, item 17 is the doc URL
        # The doc URL is unicode escaped, annoyingly
        itemTitle = page[2]
        itemUrl = page[17].encode('ascii').decode('unicode_escape')
        itemUrl = urlFuncs.trimGDocUrl(itemUrl)

        items.append((itemTitle, itemUrl))

    return items, title
def getDriveFileUrls(cls, url):
    """
    Fetch a google drive folder page and list the documents it links to.

    Args:
        url: Address of the drive folder page to fetch via `cls.wg.getpage()`.

    Returns:
        A 2-tuple `(items, title)`. `items` is a list of `(itemTitle, itemUrl)`
        tuples and `title` is a string. If the fetch was redirected to a plain
        google doc, `items` holds just that one document and `title` is the
        HTML page title. Otherwise `title` is the folder name taken from the
        data embedded in the page.

    Raises:
        AssertionError: If the embedded folder data cannot be located exactly
            once, or it lacks the 'viewerItems' / 'folderName' keys.
    """
    ctnt, handle = cls.wg.getpage(url, returnMultiple=True)

    # Pull out the title for the disambiguation page.
    soup = WebRequest.as_soup(ctnt)
    title = soup.title.string

    # Google drive supports a `read?{google doc path}` mode. As such, we look at the actual URL,
    # which tells us if we redirected to a plain google doc, and just return that if the redirect occurred.
    handleUrl = handle.geturl()
    if handleUrl != url:
        # isGdocUrl() hands back an (isGdoc, url) pair. A non-empty tuple is always
        # truthy, so the flag has to be checked explicitly.
        if urlFuncs.isGdocUrl(handleUrl)[0]:
            cls.log.info("Direct read redirect: '%s'", handleUrl)
            handleUrl = urlFuncs.trimGDocUrl(handleUrl)
            return [(title, handleUrl)], title

    # Raw string so the `\(` escapes reach the regex engine untouched.
    jsRe = re.compile(
        r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)',
        re.DOTALL)
    matches = jsRe.findall(ctnt)
    assert len(matches) == 1, "Expected exactly one folder data blob, found %s" % len(matches)

    conf = jsLiteralParse.jsParse(matches[0].strip())

    # The keys+data in the data/conf are:
    # 	'folderName' - Title of the folder, just a string
    # 	'viewerItems' - List of lists of the items in the folder, which contains the title, previewimage, and url for each item.
    # 	                Other stuff (mime types) for the files, but they're all google internal mime-types and look to be the same for
    # 	                every file, even if they're different docs types.
    # 	'folderModel' - List of UID and the view URL. Looks to be completely redundant, as all the information is also in 'viewerItems'
    assert 'viewerItems' in conf, "Folder data has no 'viewerItems' key"
    assert 'folderName' in conf, "Folder data has no 'folderName' key"

    title = conf['folderName']
    pages = conf['viewerItems']

    items = []
    for page in pages:
        # Only the two known entry layouts are handled; anything else is logged and skipped.
        if len(page) not in (18, 22):
            cls.log.error("json entry in page with an invalid length:")
            cls.log.error("%s", page)
            continue

        # Item 2 is the title, item 17 is the doc URL
        # The doc URL is unicode escaped, annoyingly
        itemTitle = page[2]
        itemUrl = page[17].encode('ascii').decode('unicode_escape')
        itemUrl = urlFuncs.trimGDocUrl(itemUrl)

        items.append((itemTitle, itemUrl))

    return items, title
def __init__(self, targetUrl):
    """
    Prepare the export state for a single google document.

    `targetUrl` is passed through `urlFuncs.isGdocUrl()`, which yields a
    flag plus a URL. When the flag is false a ValueError is raised.
    Otherwise the returned URL is trimmed with `urlFuncs.trimGDocUrl()`
    and given the zip-export suffix to form `self.url`. The untouched
    `targetUrl` is kept as `self.refererUrl`.
    """
    matched, docUrl = urlFuncs.isGdocUrl(targetUrl)
    if not matched:
        raise ValueError("Passed URL '%s' is not a google document?" % targetUrl)

    baseUrl = urlFuncs.trimGDocUrl(docUrl)

    # Both text buffers start out empty.
    self.document = self.currentChunk = ''
    self.refererUrl = targetUrl
    self.url = baseUrl + '/export?format=zip'
def __init__(self, targetUrl):
    """
    Prepare the export state for a single google document.

    `targetUrl` is passed through `urlFuncs.isGdocUrl()`, which yields a
    flag plus a URL. When the flag is false a ValueError is raised.
    Otherwise the returned URL is trimmed with `urlFuncs.trimGDocUrl()`
    and given the zip-export suffix to form `self.url`. The untouched
    `targetUrl` is kept as `self.refererUrl`.
    """
    matched, docUrl = urlFuncs.isGdocUrl(targetUrl)
    if not matched:
        raise ValueError("Passed URL '%s' is not a google document?" % targetUrl)

    baseUrl = urlFuncs.trimGDocUrl(docUrl)

    # Both text buffers start out empty.
    self.document = self.currentChunk = ''
    self.refererUrl = targetUrl
    self.url = baseUrl + '/export?format=zip'
def wantsUrl(url):
    """
    Report whether `url` is flagged as a google doc by `urlFuncs.isGdocUrl()`.

    Only the first element of that helper's result is returned; the rest
    is discarded.
    """
    checkResult = urlFuncs.isGdocUrl(url)
    return checkResult[0]