Ejemplo n.º 1
0
    def getDriveFileUrls(cls, url):
        """
        List the documents reachable from a Google Drive folder URL.

        The page at `url` is fetched through `cls.wg.getpage()`. If that fetch ended
        up on a different URL which `urlFuncs.isGdocUrl()` reports as a google doc,
        that single document is returned. Otherwise the folder listing is parsed out
        of the javascript literal embedded in the page.

        Args:
            url: Address of the drive folder (or `read?` style link) to resolve.

        Returns:
            A `(items, title)` tuple. `items` is a list of `(itemTitle, itemUrl)`
            tuples with each URL passed through `urlFuncs.trimGDocUrl()`. `title` is
            the page's `<title>` text in the redirect case, and the `folderName`
            entry of the embedded data otherwise.

        Raises:
            AssertionError: If the page does not contain exactly one embedded data
                blob, or the parsed blob lacks `viewerItems` / `folderName`.
        """
        ctnt, handle = cls.wg.getpage(url, returnMultiple=True)

        # Pull out the title for the disambiguation page.
        soup = WebMirror.util.webFunctions.as_soup(ctnt)
        title = soup.title.string

        # Google drive supports a `read?{google doc path}` mode. As such, we look at the actual URL,
        # which tells us if we redirected to a plain google doc, and just return that if the redirect occurred.
        handleUrl = handle.geturl()

        # isGdocUrl() returns a (isGdoc, url) tuple, as the other call sites in this
        # file show. The tuple itself is always truthy, so only element 0 is tested.
        if handleUrl != url and urlFuncs.isGdocUrl(handleUrl)[0]:
            cls.log.info("Direct read redirect: '%s'", handleUrl)
            handleUrl = urlFuncs.trimGDocUrl(handleUrl)
            return [(title, handleUrl)], title

        # Raw string, so the `\(` escapes reach the regex engine without tripping
        # python's invalid-escape-sequence warning.
        jsRe = re.compile(
            r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)',
            re.DOTALL)

        matches = jsRe.findall(ctnt)
        assert len(matches) == 1, "Expected exactly one folder data blob in the page"

        conf = jsLiteralParse.jsParse(matches[0].strip())

        # The keys+data in the data/conf are:
        # 'folderName'  - Title of the folder, just a string
        # 'viewerItems' - List of lists of the items in the folder, which contains the title, previewimage, and url for each item.
        #                 Other stuff (mime types) for the files, but they're all google internal mime-types and look to be the same for
        #                 every file, even if they're different docs types.
        # 'folderModel' - List of UID and the view URL. Looks to be completely redundant, as all the information is also in 'viewerItems'

        assert 'viewerItems' in conf, "Folder data has no 'viewerItems' entry"
        assert 'folderName' in conf, "Folder data has no 'folderName' entry"

        title = conf['folderName']

        items = []
        for page in conf['viewerItems']:
            # Only the two known entry layouts are handled; anything else is logged and skipped.
            if len(page) not in (18, 22):
                cls.log.error("json entry in page with an invalid length:")
                cls.log.error("%s", page)
                continue

            # Item 2 is the title, item 17 is the doc URL
            # The doc URL is unicode escaped, annoyingly
            itemTitle = page[2]
            itemUrl = page[17].encode('ascii').decode('unicode_escape')
            itemUrl = urlFuncs.trimGDocUrl(itemUrl)

            items.append((itemTitle, itemUrl))

        return items, title
Ejemplo n.º 2
0
	def getDriveFileUrls(cls, url):
		"""
		List the documents reachable from a Google Drive folder URL.

		The page at `url` is fetched through `cls.wg.getpage()`. If that fetch ended
		up on a different URL which `urlFuncs.isGdocUrl()` reports as a google doc,
		that single document is returned. Otherwise the folder listing is parsed out
		of the javascript literal embedded in the page.

		Args:
			url: Address of the drive folder (or `read?` style link) to resolve.

		Returns:
			A `(items, title)` tuple. `items` is a list of `(itemTitle, itemUrl)`
			tuples with each URL passed through `urlFuncs.trimGDocUrl()`. `title` is
			the page's `<title>` text in the redirect case, and the `folderName`
			entry of the embedded data otherwise.

		Raises:
			AssertionError: If the page does not contain exactly one embedded data
				blob, or the parsed blob lacks `viewerItems` / `folderName`.
		"""
		ctnt, handle = cls.wg.getpage(url, returnMultiple=True)

		# Pull out the title for the disambiguation page.
		soup = bs4.BeautifulSoup(ctnt)
		title = soup.title.string

		# Google drive supports a `read?{google doc path}` mode. As such, we look at the actual URL,
		# which tells us if we redirected to a plain google doc, and just return that if the redirect occurred.
		handleUrl = handle.geturl()

		# isGdocUrl() returns a (isGdoc, url) tuple, as the other call sites in this
		# file show. The tuple itself is always truthy, so only element 0 is tested.
		if handleUrl != url and urlFuncs.isGdocUrl(handleUrl)[0]:
			cls.log.info("Direct read redirect: '%s'", handleUrl)
			handleUrl = urlFuncs.trimGDocUrl(handleUrl)
			return [(title, handleUrl)], title

		# Raw string, so the `\(` escapes reach the regex engine without tripping
		# python's invalid-escape-sequence warning.
		jsRe = re.compile(r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)', re.DOTALL)

		matches = jsRe.findall(ctnt)
		assert len(matches) == 1, "Expected exactly one folder data blob in the page"

		conf = jsLiteralParse.jsParse(matches[0].strip())

		# The keys+data in the data/conf are:
		# 'folderName'  - Title of the folder, just a string
		# 'viewerItems' - List of lists of the items in the folder, which contains the title, previewimage, and url for each item.
		#                 Other stuff (mime types) for the files, but they're all google internal mime-types and look to be the same for
		#                 every file, even if they're different docs types.
		# 'folderModel' - List of UID and the view URL. Looks to be completely redundant, as all the information is also in 'viewerItems'

		assert 'viewerItems' in conf, "Folder data has no 'viewerItems' entry"
		assert 'folderName' in conf, "Folder data has no 'folderName' entry"

		title = conf['folderName']

		items = []
		for page in conf['viewerItems']:
			# Only the two known entry layouts are handled; anything else is logged and skipped.
			if len(page) not in (18, 22):
				cls.log.error("json entry in page with an invalid length:")
				cls.log.error("%s", page)
				continue

			# Item 2 is the title, item 17 is the doc URL
			# The doc URL is unicode escaped, annoyingly
			itemTitle = page[2]
			itemUrl = page[17].encode('ascii').decode('unicode_escape')
			itemUrl = urlFuncs.trimGDocUrl(itemUrl)

			items.append((itemTitle, itemUrl))

		return items, title
Ejemplo n.º 3
0
    def __init__(self, targetUrl):
        """
        Prepare the export state for the google document at `targetUrl`.

        Args:
            targetUrl: URL that `urlFuncs.isGdocUrl()` must flag as a google doc.

        Raises:
            ValueError: If `urlFuncs.isGdocUrl()` does not flag `targetUrl` as a google doc.
        """
        isGdoc, docUrl = urlFuncs.isGdocUrl(targetUrl)
        if not isGdoc:
            raise ValueError("Passed URL '%s' is not a google document?" % targetUrl)

        # The original address is kept separately from the zip-export address
        # built from the trimmed document URL.
        self.refererUrl = targetUrl
        self.url = urlFuncs.trimGDocUrl(docUrl) + "/export?format=zip"

        # Both text buffers start out empty.
        self.document = ""
        self.currentChunk = ""
Ejemplo n.º 4
0
    def __init__(self, targetUrl):
        """
        Set up the zip-export URL and empty buffers for a google document.

        Args:
            targetUrl: URL that `urlFuncs.isGdocUrl()` must flag as a google doc.

        Raises:
            ValueError: If `urlFuncs.isGdocUrl()` does not flag `targetUrl` as a google doc.
        """
        isGdoc, baseUrl = urlFuncs.isGdocUrl(targetUrl)
        if not isGdoc:
            message = "Passed URL '%s' is not a google document?" % targetUrl
            raise ValueError(message)

        trimmedUrl = urlFuncs.trimGDocUrl(baseUrl)

        # Export address is derived from the trimmed URL; the caller's
        # original URL is retained as-is alongside it.
        self.url = trimmedUrl + '/export?format=zip'
        self.refererUrl = targetUrl

        # Both text buffers start out empty.
        self.document = self.currentChunk = ''
	def wantsUrl(url):
		"""
		Return the first element of `urlFuncs.isGdocUrl(url)`.

		The sibling `__init__` unpacks that result as `(isGdoc, url)`, so this is
		the "is a google doc" flag for `url`.
		"""
		gdocCheck = urlFuncs.isGdocUrl(url)
		return gdocCheck[0]
Ejemplo n.º 6
0
 def wantsUrl(url):
     """
     Return the first element of `urlFuncs.isGdocUrl(url)`.

     Other call sites unpack that result as `(isGdoc, url)`, so this is the
     "is a google doc" flag for `url`.
     """
     gdocCheck = urlFuncs.isGdocUrl(url)
     return gdocCheck[0]