Example #1
0
	def getDriveFileUrls(cls, url):
		ctnt, handle = cls.wg.getpage(url, returnMultiple=True)

		# Pull out the title for the disambiguation page.
		soup = common.util.webFunctions.as_soup(ctnt)
		title = soup.title.string

		# Google drive supports a `read?{google doc path}` mode. As such, we look at the actual URL,
		# which tells us whether we redirected to a plain google doc, and just return that if the redirect occurred.
		handleUrl = handle.geturl()
		if handleUrl != url:
			if urlFuncs.isGdocUrl(handleUrl):
				cls.log.info("Direct read redirect: '%s'", handleUrl)
				handleUrl = urlFuncs.trimGDocUrl(handleUrl)
				return [(title, handleUrl)], title

		jsRe = re.compile(r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)', re.DOTALL)

		items = jsRe.findall(ctnt)
		assert len(items) == 1

		data = '{cont}'.format(cont=items.pop().strip())
		conf = jsLiteralParse.jsParse(data)

		# The keys+data in the data/conf are:
		# 'folderName'  - Title of the folder, just a string
		# 'viewerItems' - List of lists of the items in the folder, containing the title, preview image, and URL for each item.
		# 				There is other stuff (mime types) for the files too, but they're all google-internal mime-types and look to be the same for
		# 				every file, even if they're different doc types.
		# 'folderModel' - List of UID and the view URL. Looks to be completely redundant, as all the information is also in 'viewerItems'

		assert 'viewerItems' in conf
		assert 'folderName' in conf

		title = conf['folderName']

		pages = conf['viewerItems']

		items = []
		for page in pages:
			if len(page) != 18 and len(page) != 22:
				cls.log.error("json entry in page with an invalid length:")
				cls.log.error("%s", page)
				continue


			# Item 2 is the title, item 17 is the doc URL
			# The doc URL is unicode escaped, annoyingly
			itemTitle = page[2]
			itemUrl   = page[17].encode('ascii').decode('unicode_escape')

			itemUrl = urlFuncs.trimGDocUrl(itemUrl)

			items.append((itemTitle, itemUrl))


		return items, title
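	# A minimal, self-contained sketch (not part of the original module) of the
	# unicode_escape round-trip used for page[17] above: the Drive folder page
	# serializes URLs inside its embedded JS with \uXXXX escapes, so the raw
	# string must be decoded before it can be used. The sample URL below is
	# hypothetical.
	@staticmethod
	def _demoUnicodeEscapeDecode():
		escaped = r'https://docs.google.com/document/d/abc123/edit?usp\u003dsharing'
		decoded = escaped.encode('ascii').decode('unicode_escape')
		# decoded == 'https://docs.google.com/document/d/abc123/edit?usp=sharing'
		return decoded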
Example #2
0
    def getDriveFileUrls(cls, url):
        ctnt, handle = cls.wg.getpage(url, returnMultiple=True)

        # Pull out the title for the disambiguation page.
        soup = WebRequest.as_soup(ctnt)
        title = soup.title.string

        # Google drive supports a `read?{google doc path}` mode. As such, we look at the actual URL,
        # which tells us whether we redirected to a plain google doc, and just return that if the redirect occurred.
        handleUrl = handle.geturl()
        if handleUrl != url:
            if urlFuncs.isGdocUrl(handleUrl):
                cls.log.info("Direct read redirect: '%s'", handleUrl)
                handleUrl = urlFuncs.trimGDocUrl(handleUrl)
                return [(title, handleUrl)], title

        jsRe = re.compile(
            r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)',
            re.DOTALL)

        items = jsRe.findall(ctnt)
        assert len(items) == 1

        data = '{cont}'.format(cont=items.pop().strip())
        conf = jsLiteralParse.jsParse(data)

        # The keys+data in the data/conf are:
        # 'folderName'  - Title of the folder, just a string
        # 'viewerItems' - List of lists of the items in the folder, containing the title, preview image, and URL for each item.
        #                 There is other stuff (mime types) for the files too, but they're all google-internal mime-types and look to be the same for
        #                 every file, even if they're different doc types.
        # 'folderModel' - List of UID and the view URL. Looks to be completely redundant, as all the information is also in 'viewerItems'

        assert 'viewerItems' in conf
        assert 'folderName' in conf

        title = conf['folderName']

        pages = conf['viewerItems']

        items = []
        for page in pages:
            if len(page) != 18 and len(page) != 22:
                cls.log.error("json entry in page with an invalid length:")
                cls.log.error("%s", page)
                continue

            # Item 2 is the title, item 17 is the doc URL
            # The doc URL is unicode escaped, annoyingly
            itemTitle = page[2]
            itemUrl = page[17].encode('ascii').decode('unicode_escape')

            itemUrl = urlFuncs.trimGDocUrl(itemUrl)

            items.append((itemTitle, itemUrl))

        return items, title
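    # A hedged, self-contained sketch (not in the original source) of the
    # 'var data = ...' extraction performed above, run against a minimal
    # hypothetical page body; the real Drive folder markup is far larger.
    @staticmethod
    def _demoExtractFolderData():
        import re  # the module already imports re; repeated here so the sketch stands alone
        ctnt = ("<script>var config = {}; var data = ['folderName', 1]; "
                "_initFolderLandingPageApplication(config, data)</script>")
        jsRe = re.compile(
            r'var data = (.*?); _initFolderLandingPageApplication\(config, data\)',
            re.DOTALL)
        items = jsRe.findall(ctnt)
        # items == ["['folderName', 1]"]
        return items[0]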
    def extractGoogleDriveFolder(self, driveUrl):
        '''
        Extract all the relevant links from a google drive directory, and push them into
        the queued URL queue.
        '''

        newLinks = []
        self.log.info("Fetching drive container page")
        docReferences, pgTitle = gdp.GDocExtractor.getDriveFileUrls(driveUrl)
        # print('docReferences', docReferences)
        for dummy_title, url in docReferences:
            url = urlFuncs.trimGDocUrl(url)
            if url not in newLinks:
                newLinks.append(url)

        self.log.info("Generating google drive disambiguation page!")
        soup = gdp.makeDriveDisambiguation(docReferences, pgTitle)
        # print(disamb)

        soup = self.relink(soup)

        disamb = soup.prettify()

        ret = {}

        ret['contents'] = disamb
        ret['title'] = pgTitle
        ret['plainLinks'] = newLinks
        ret['rsrcLinks'] = []  # drive folders don't have resources

        self.log.info("Found %s items in google drive directory",
                      len(docReferences))

        return ret
    def extractGoogleDriveFolder(self, driveUrl):
        """
		Extract all the relevant links from a google drive directory, and push them into
		the queued URL queue.

		"""

        newLinks = []
        self.log.info("Fetching drive container page")
        docReferences, pgTitle = gdp.GDocExtractor.getDriveFileUrls(driveUrl)
        # print('docReferences', docReferences)
        for dummy_title, url in docReferences:
            url = urlFuncs.trimGDocUrl(url)
            if url not in newLinks:
                newLinks.append(url)

        self.log.info("Generating google drive disambiguation page!")
        soup = gdp.makeDriveDisambiguation(docReferences, pgTitle)
        # print(disamb)

        soup = self.relink(soup)

        disamb = soup.prettify()

        ret = {}

        ret["contents"] = disamb
        ret["title"] = pgTitle
        ret["plainLinks"] = newLinks
        ret["rsrcLinks"] = []  # drive folders don't have resources

        self.log.info("Found %s items in google drive directory", len(docReferences))

        return ret
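    # A hedged usage sketch (not in the original source) of the dict returned
    # by extractGoogleDriveFolder() above; handleDriveFolder and enqueueUrl are
    # hypothetical names used for illustration only.
    def handleDriveFolder(self, driveUrl):
        ret = self.extractGoogleDriveFolder(driveUrl)
        # 'contents' holds the prettified disambiguation HTML, 'plainLinks'
        # the de-duplicated document URLs that still need to be fetched.
        for link in ret['plainLinks']:
            self.enqueueUrl(link)  # hypothetical queueing helper
        return ret['title'], ret['contents']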
Example #5
0
	def relink(self, soup, imRelink=None):
		# The google doc reader relinking mechanism requires overriding the
		# image relinking behaviour. As such, allow that to be overridden
		# if needed.
		# print("relink call!")
		# print(self._relinkDomains)
		if not imRelink:
			imRelink = self.convertToReaderImage


		for (isImg, tag, attr) in urlFuncs.urlContainingTargets:

			if not isImg:
				for link in soup.findAll(tag):
					try:
						# print("Link!", self.checkRelinkDomain(link[attr]), link[attr])
						# if self.checkRelinkDomain(link[attr]):
						link[attr] = self.convertToReaderUrl(link[attr])

						if "google.com" in urllib.parse.urlsplit(link[attr].lower()).netloc:
							link[attr] = urlFuncs.trimGDocUrl(link[attr])
							# print("Relinked", link[attr])
					except TypeError:
						# Empty href tags, not sure how this happens.
						continue
					except KeyError:
						continue

			else:
				for link in soup.findAll(tag):
					try:
						link[attr] = imRelink(link[attr])

						if tag == 'img':
							# Force images that are oversize to fit the window.
							link["style"] = 'max-width: 95%;'

							if 'width' in link.attrs:
								del link.attrs['width']
							if 'height' in link.attrs:
								del link.attrs['height']

					except TypeError:
						continue
					except KeyError:
						continue


		# Keyhole patch for fictionpress next/prev buttons onclick elements.
		for button in [item for item in soup.findAll('button') if item.has_attr("onclick")]:
			if button['onclick'].startswith("self.location='") \
				and button['onclick'].endswith("'")            \
				and button['onclick'].count("'") == 2:
				prefix, url, postfix = button['onclick'].split("'")
				url = urlFuncs.rebaseUrl(url, self.pageUrl)
				url = self.convertToReaderUrl(url)
				button['onclick'] = "'".join((prefix, url, postfix))

		return soup
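	# A minimal, self-contained illustration (not part of the original module)
	# of the onclick keyhole patch above: only the URL between the two single
	# quotes is rewritten, the surrounding javascript is left untouched. The
	# sample onclick value and the URL prefixes are hypothetical stand-ins for
	# rebaseUrl() and convertToReaderUrl().
	@staticmethod
	def _demoOnclickPatch():
		onclick = "self.location='/s/12345/2/Some-Story'"
		assert onclick.startswith("self.location='") and onclick.count("'") == 2
		prefix, url, postfix = onclick.split("'")
		url = 'https://www.fictionpress.com' + url    # stand-in for rebaseUrl()
		url = '/render?url=' + url                    # stand-in for convertToReaderUrl()
		return "'".join((prefix, url, postfix))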
Example #6
0
    def processLinkItem(self, url, baseUrl):

        url = urlFuncs.cleanUrl(url)
        if not url:
            return None

        # F*****g tumblr redirects.
        if url.startswith("https://www.tumblr.com/login"):
            return None

        for badword in self._badwords:
            if badword in url:
                return

        url = urlFuncs.urlClean(url)

        if not url:
            return None

        if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
            url = urlFuncs.trimGDocUrl(url)

            if url.startswith('https://docs.google.com/document/d/images'):
                return

            # self.log.info("Resolved URL = '%s'", url)
            ret = self.processNewUrl(url, baseUrl)
            return ret
            # self.log.info("New G link: '%s'", url)

        else:
            # Remove any URL fragments causing multiple retrieval of the same resource.
            if url != urlFuncs.trimGDocUrl(url):
                print('Old URL: "%s"' % url)
                print('Trimmed: "%s"' % urlFuncs.trimGDocUrl(url))
                raise ValueError("Wat? Url change? Url: '%s'" % url)
            ret = self.processNewUrl(url, baseUrl)
            # print("Returning:", ret)
            return ret
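    # A minimal illustration (not part of the original module) of the netloc
    # check used above: urlsplit() isolates the hostname, so substring matching
    # on the whole URL can't be fooled by query strings or paths that merely
    # contain "google.com". The example URLs are hypothetical.
    @staticmethod
    def _demoIsGoogleNetloc(url):
        import urllib.parse  # already imported at module level; repeated so the sketch stands alone
        # _demoIsGoogleNetloc('https://docs.google.com/document/d/x') -> True
        # _demoIsGoogleNetloc('https://example.com/?ref=google.com')  -> False
        return "google.com" in urllib.parse.urlsplit(url.lower()).netloc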
	def relink(self, soup, imRelink=None):
		# The google doc reader relinking mechanism requires overriding the
		# image relinking behaviour. As such, allow that to be overridden
		# if needed.
		# print("relink call!")
		# print(self._relinkDomains)
		if not imRelink:
			imRelink = self.convertToReaderImage


		for (isImg, tag, attr) in urlFuncs.urlContainingTargets:

			if not isImg:
				for link in soup.findAll(tag):
					try:
						# print("Link!", self.checkRelinkDomain(link[attr]), link[attr])
						# if self.checkRelinkDomain(link[attr]):
						link[attr] = self.convertToReaderUrl(link[attr])

						if "google.com" in urllib.parse.urlsplit(link[attr].lower()).netloc:
							link[attr] = urlFuncs.trimGDocUrl(link[attr])
							# print("Relinked", link[attr])
					except KeyError:
						continue

			else:
				for link in soup.findAll(tag):
					try:
						link[attr] = imRelink(link[attr])

						if tag == 'img':
							# Force images that are oversize to fit the window.
							link["style"] = 'max-width: 95%;'

							if 'width' in link.attrs:
								del link.attrs['width']
							if 'height' in link.attrs:
								del link.attrs['height']

					except KeyError:
						continue


		# Keyhole patch for fictionpress next/prev buttons onclick elements.
		for button in [item for item in soup.findAll('button') if item.has_attr("onclick")]:
			if button['onclick'].startswith("self.location='") \
				and button['onclick'].endswith("'")            \
				and button['onclick'].count("'") == 2:
				prefix, url, postfix = button['onclick'].split("'")
				url = urlFuncs.rebaseUrl(url, self.pageUrl)
				url = self.convertToReaderUrl(url)
				button['onclick'] = "'".join((prefix, url, postfix))

		return soup
	def processLinkItem(self, url, baseUrl):

		url = urlFuncs.cleanUrl(url)
		if not url:
			return None

		# F*****g tumblr redirects.
		if url.startswith("https://www.tumblr.com/login"):
			return None

		for badword in self._badwords:
			if badword in url:
				return

		url = urlFuncs.urlClean(url)

		if not url:
			return None

		if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
			url = urlFuncs.trimGDocUrl(url)

			if url.startswith('https://docs.google.com/document/d/images'):
				return

			# self.log.info("Resolved URL = '%s'", url)
			ret = self.processNewUrl(url, baseUrl)
			return ret
			# self.log.info("New G link: '%s'", url)

		else:
			# Remove any URL fragments causing multiple retrieval of the same resource.
			if url != urlFuncs.trimGDocUrl(url):
				print('Old URL: "%s"' % url)
				print('Trimmed: "%s"' % urlFuncs.trimGDocUrl(url))
				raise ValueError("Wat? Url change? Url: '%s'" % url)
			ret = self.processNewUrl(url, baseUrl)
			# print("Returning:", ret)
			return ret
Example #9
0
	def __init__(self, targetUrl):

		isGdoc, url = urlFuncs.isGdocUrl(targetUrl)
		if not isGdoc:
			raise ValueError("Passed URL '%s' is not a google document?" % targetUrl)

		url = urlFuncs.trimGDocUrl(url)
		self.url = url+'/export?format=zip'
		self.refererUrl = targetUrl

		self.document = ''

		self.currentChunk = ''
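	# A hedged, self-contained sketch (not part of the original module) of the
	# export URL the constructor above builds: assuming trimGDocUrl() reduces
	# the link to the bare document path, the zip-export endpoint is appended
	# directly. The document id below is hypothetical.
	@staticmethod
	def _demoExportUrl():
		trimmed = 'https://docs.google.com/document/d/abc123'
		return trimmed + '/export?format=zip'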
Example #10
0
    def __init__(self, targetUrl):

        isGdoc, url = urlFuncs.isGdocUrl(targetUrl)
        if not isGdoc:
            raise ValueError("Passed URL '%s' is not a google document?" %
                             targetUrl)

        url = urlFuncs.trimGDocUrl(url)
        self.url = url + '/export?format=zip'
        self.refererUrl = targetUrl

        self.document = ''

        self.currentChunk = ''
Example #11
0
    def processGdocPage(self, url, content):
        dummy_fName, content = content
        soup = WebRequest.as_soup(content)
        urlFuncs.canonizeUrls(soup, url)

        pgTitle, soup = self.cleanGdocPage(soup, url)

        plainLinks = self.extractLinks(soup, url)
        self.log.info("Page title = '%s'", pgTitle)
        soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

        url = self.preprocessGdocReaderUrl(url)
        url = urlFuncs.trimGDocUrl(url)
        # Since the content we're extracting will be embedded into another page, we want to
        # strip out the <body> and <html> tags. `unwrap()` replaces the tag it's called on with
        # that tag's contents, so we end up with just the contents of the <body> tag.
        soup.body.unwrap()
        pgBody = soup.prettify()

        # No image links, since they're served as resource files in a google doc
        imageLinks = []
        return plainLinks, imageLinks, pgTitle, pgBody
	def processGdocPage(self, url, content):
		dummy_fName, content = content
		soup = common.util.webFunctions.as_soup(content)
		urlFuncs.canonizeUrls(soup, url)

		pgTitle, soup = self.cleanGdocPage(soup, url)

		plainLinks = self.extractLinks(soup, url)
		self.log.info("Page title = '%s'", pgTitle)
		soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

		url = self.preprocessGdocReaderUrl(url)
		url = urlFuncs.trimGDocUrl(url)
		# Since the content we're extracting will be embedded into another page, we want to
		# strip out the <body> and <html> tags. `unwrap()` replaces the tag it's called on with
		# that tag's contents, so we end up with just the contents of the <body> tag.
		soup.body.unwrap()
		pgBody = soup.prettify()

		# No image links, since they're served as resource files in a google doc
		imageLinks = []
		return plainLinks, imageLinks, pgTitle, pgBody
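	# A small, self-contained demonstration (not part of the original module)
	# of the unwrap() call used above: the <body> tag is removed while its
	# children stay in place, leaving markup that can be embedded in another
	# page. The stdlib 'html.parser' backend is used so no extra parser (lxml)
	# is needed for the sketch.
	@staticmethod
	def _demoBodyUnwrap():
		import bs4
		soup = bs4.BeautifulSoup('<html><body><p>hi</p></body></html>', 'html.parser')
		soup.body.unwrap()
		soup.html.unwrap()
		return soup.decode()  # -> '<p>hi</p>'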