Example #1
    def processNewUrl(self, url, baseUrl=None, istext=True):
        if not url.lower().startswith("http"):
            if baseUrl:
                # If we have a base URL to take the scheme from, we pull that scheme out,
                # prepend it to the rest of the URL's segments, and then unsplit those back into a full URL
                scheme = urllib.parse.urlsplit(baseUrl.lower()).scheme
                rest = urllib.parse.urlsplit(url.lower())[1:]
                params = (scheme, ) + rest

                # self.log.info("Had to add scheme (%s) to URL: '%s'", scheme, url)
                url = urllib.parse.urlunsplit(params)

            elif self.ignoreBadLinks:
                self.log.error("Skipping a malformed URL!")
                self.log.error("Bad URL: '%s'", url)
                return
            else:
                raise ValueError("Url isn't a url: '%s'" % url)
        if gdp.isGdocUrl(url) or gdp.isGFileUrl(url):
            if gdp.trimGDocUrl(url) != url:
                raise ValueError("Invalid link crept through! Link: '%s'" %
                                 url)

        if not url.lower().startswith('http'):
            raise ValueError("Failure adding scheme to URL: '%s'" % url)

        if not self.checkDomain(url) and istext:
            raise ValueError("Invalid url somehow got through: '%s'" % url)

        if '/view/export?format=zip' in url:
            raise ValueError("Wat?")
        return url
Example #2
	def processNewUrl(self, url, baseUrl=None, istext=True):
		if not url.lower().startswith("http"):
			if baseUrl:
				# If we have a base URL to take the scheme from, we pull that scheme out,
				# prepend it to the rest of the URL's segments, and then unsplit those back into a full URL
				scheme = urllib.parse.urlsplit(baseUrl.lower()).scheme
				rest = urllib.parse.urlsplit(url.lower())[1:]
				params = (scheme, ) + rest

				# self.log.info("Had to add scheme (%s) to URL: '%s'", scheme, url)
				url = urllib.parse.urlunsplit(params)

			elif self.ignoreBadLinks:
				self.log.error("Skipping a malformed URL!")
				self.log.error("Bad URL: '%s'", url)
				return
			else:
				raise ValueError("Url isn't a url: '%s'" % url)
		if gdp.isGdocUrl(url) or gdp.isGFileUrl(url):
			if gdp.trimGDocUrl(url) != url:
				raise ValueError("Invalid link crept through! Link: '%s'" % url)


		if not url.lower().startswith('http'):
			raise ValueError("Failure adding scheme to URL: '%s'" % url)

		if not self.checkDomain(url) and istext:
			raise ValueError("Invalid url somehow got through: '%s'" % url)

		if '/view/export?format=zip' in url:
			raise ValueError("Wat?")
		return url
Example #3
	def extractGoogleDriveFolder(self, driveUrl):
		'''
		Extract all the relevant links from a google drive directory, and push them into
		the queued URL queue.

		'''

		newLinks = []
		self.log.info("Fetching drive container page")
		docReferences, pgTitle = gdp.GDocExtractor.getDriveFileUrls(driveUrl)
		# print('docReferences', docReferences)
		for dummy_title, url in docReferences:
			url = gdp.trimGDocUrl(url)
			if url not in newLinks:
				newLinks.append(url)

		self.log.info("Generating google drive disambiguation page!")
		soup = gdp.makeDriveDisambiguation(docReferences, pgTitle)
		# print(disamb)

		soup = self.relink(soup)

		disamb = soup.prettify()

		ret = {}

		ret['contents']   = disamb
		ret['title']      = pgTitle
		ret['plainLinks'] = newLinks
		ret['rsrcLinks']  = []  # drive folders don't have resources


		self.log.info("Found %s items in google drive directory", len(docReferences))

		return ret
Example #4
    def extractGoogleDriveFolder(self, driveUrl):
        '''
        Extract all the relevant links from a google drive directory, and push them into
        the queued URL queue.

        '''

        newLinks = []
        self.log.info("Fetching drive container page")
        docReferences, pgTitle = gdp.GDocExtractor.getDriveFileUrls(driveUrl)
        # print('docReferences', docReferences)
        for dummy_title, url in docReferences:
            url = gdp.trimGDocUrl(url)
            if url not in newLinks:
                newLinks.append(url)

        self.log.info("Generating google drive disambiguation page!")
        soup = gdp.makeDriveDisambiguation(docReferences, pgTitle)
        # print(disamb)

        soup = self.relink(soup)

        disamb = soup.prettify()

        ret = {}

        ret['contents'] = disamb
        ret['title'] = pgTitle
        ret['plainLinks'] = newLinks
        ret['rsrcLinks'] = []  # drive folders don't have resources

        self.log.info("Found %s items in google drive directory",
                      len(docReferences))

        return ret
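
The loop at the top of extractGoogleDriveFolder is an order-preserving de-duplication of the trimmed document links. Below is a small sketch of just that step, with gdp.trimGDocUrl stubbed out as a plain callable so the snippet runs on its own.

def dedupe_links(docReferences, trim):
    # docReferences is assumed to be a list of (title, url) tuples, as returned
    # by gdp.GDocExtractor.getDriveFileUrls() in the examples above.
    newLinks = []
    for dummy_title, url in docReferences:
        url = trim(url)            # stand-in for gdp.trimGDocUrl
        if url not in newLinks:    # keep the first occurrence, preserve order
            newLinks.append(url)
    return newLinks

print(dedupe_links([("A", "u1"), ("B", "u2"), ("C", "u1")], trim=lambda u: u))
# -> ['u1', 'u2']
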
Example #5
	def processLinkItem(self, url, baseUrl):
		url = gdp.clearOutboundProxy(url)
		url = gdp.clearBitLy(url)

		# Filter by domain
		if not self.checkDomain(url):
			# print("Filtering", self.checkDomain(url), url)
			return


		# and by blocked words
		for badword in self._badwords:
			if badword in url:
				# print("hadbad", self.checkDomain(url), url)

				return



		if not self.checkFollowGoogleUrl(url):
			return

		url = urlFuncs.urlClean(url)

		if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
			url = gdp.trimGDocUrl(url)

			if url.startswith('https://docs.google.com/document/d/images'):
				return

			# self.log.info("Resolved URL = '%s'", url)
			ret = self.processNewUrl(url, baseUrl)
			return ret
			# self.log.info("New G link: '%s'", url)

		else:
			# Remove any URL fragments causing multiple retrieval of the same resource.
			if url != gdp.trimGDocUrl(url):
				print('Old URL: "%s"' % url)
				print('Trimmed: "%s"' % gdp.trimGDocUrl(url))
				raise ValueError("Wat? Url change? Url: '%s'" % url)
			ret = self.processNewUrl(url, baseUrl)
			# print("Returning:", ret)
			return ret
Example #6
    def processLinkItem(self, url, baseUrl):
        url = gdp.clearOutboundProxy(url)
        url = gdp.clearBitLy(url)

        # Filter by domain
        if not self.checkDomain(url):
            # print("Filtering", self.checkDomain(url), url)
            return

        # and by blocked words
        for badword in self._badwords:
            if badword in url:
                # print("hadbad", self.checkDomain(url), url)

                return

        if not self.checkFollowGoogleUrl(url):
            return

        url = TextScrape.urlFuncs.urlClean(url)

        if "google.com" in urllib.parse.urlsplit(url.lower()).netloc:
            url = gdp.trimGDocUrl(url)

            if url.startswith('https://docs.google.com/document/d/images'):
                return

            # self.log.info("Resolved URL = '%s'", url)
            return self.processNewUrl(url, baseUrl)
            # self.log.info("New G link: '%s'", url)

        else:
            # Remove any URL fragments causing multiple retrieval of the same resource.
            if url != gdp.trimGDocUrl(url):
                print('Old URL: "%s"' % url)
                print('Trimmed: "%s"' % gdp.trimGDocUrl(url))
                raise ValueError("Wat? Url change? Url: '%s'" % url)
            return self.processNewUrl(url, baseUrl)
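
processLinkItem is effectively a filter chain: each check either returns None (dropping the link) or lets the URL continue on to processNewUrl. Here is a condensed sketch of that control flow, with the cleaners, predicates, and bad-word list replaced by made-up stand-ins for gdp.clearOutboundProxy, gdp.clearBitLy, self.checkDomain, and self._badwords.

def filter_link(url, cleaners, checks):
    for clean in cleaners:        # e.g. clearOutboundProxy, clearBitLy, urlClean
        url = clean(url)
    for ok in checks:             # e.g. the domain check and the bad-word check
        if not ok(url):
            return None           # mirrors the bare `return` in the original
    return url

badwords = ["?replytocom="]       # illustrative value only
kept = filter_link(
    "https://docs.google.com/document/d/abc123 ",
    cleaners=[str.strip],
    checks=[lambda u: "google.com" in u,
            lambda u: not any(w in u for w in badwords)],
)
print(kept)
# -> https://docs.google.com/document/d/abc123
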
Example #7
def urlClean(url):
	# Google docs can be accessed with or without the '/preview' postfix
	# We want to remove this if it's present, so we don't duplicate content.
	url = gdp.trimGDocUrl(url)

	while True:
		url2 = urllib.parse.unquote(url)
		url2 = url2.split("#")[0]
		if url2 == url:
			break
		url = url2

	# Clean off whitespace.
	url = url.strip()

	return url
Example #8
def urlClean(url):
	# Google docs can be accessed with or without the '/preview' postfix
	# We want to remove this if it's present, so we don't duplicate content.
	url = gdp.trimGDocUrl(url)

	while True:
		url2 = urllib.parse.unquote(url)
		url2 = url2.split("#")[0]
		if url2 == url:
			break
		url = url2

	# Clean off whitespace.
	url = url.strip()

	return url
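
urlClean loops until unquoting and fragment-stripping reach a fixed point, which matters for links that arrive double-percent-encoded. A short usage sketch of that loop with a made-up URL:

import urllib.parse

url = "https://example.com/doc%2523section"   # '%25' decodes to '%', so this is double-encoded
while True:
    url2 = urllib.parse.unquote(url)
    url2 = url2.split("#")[0]
    if url2 == url:
        break
    url = url2
print(url)
# -> https://example.com/doc  (two unquote passes, then the '#section' fragment is dropped)
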
Example #9
	def relink(self, soup, imRelink=None):
		# The google doc reader relinking mechanism requires overriding the
		# image relinking mechanism. As such, allow that to be overridden
		# if needed
		# print("relink call!")
		# print(self._relinkDomains)
		if not imRelink:
			imRelink = self.convertToReaderImage


		for (isImg, tag, attr) in urlFuncs.urlContainingTargets:

			if not isImg:
				for link in soup.findAll(tag):
					try:
						# print("Link!", self.checkRelinkDomain(link[attr]), link[attr])
						if self.checkRelinkDomain(link[attr]):
							link[attr] = self.convertToReaderUrl(link[attr])

						if "google.com" in urllib.parse.urlsplit(link[attr].lower()).netloc:
							link[attr] = gdp.trimGDocUrl(link[attr])
							# print("Relinked", link[attr])
					except KeyError:
						continue

			else:
				for link in soup.findAll(tag):
					try:
						link[attr] = imRelink(link[attr])

						if tag == 'img':
							# Force images that are oversize to fit the window.
							link["style"] = 'max-width: 95%;'

							if 'width' in link.attrs:
								del link.attrs['width']
							if 'height' in link.attrs:
								del link.attrs['height']

					except KeyError:
						continue

		return soup
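
relink walks a table of (isImg, tag, attr) triples from urlFuncs.urlContainingTargets. That table is not shown in these examples, so the entries below are illustrative guesses; the snippet only demonstrates the oversize-image handling on a parsed tree.

import bs4

urlContainingTargets = [          # illustrative only; the real list lives in urlFuncs
    (False, "a",   "href"),
    (True,  "img", "src"),
]

soup = bs4.BeautifulSoup('<a href="https://example.com">x</a><img src="x.png" width="900">',
                         "html.parser")
for (isImg, tag, attr) in urlContainingTargets:
    for link in soup.findAll(tag):
        if attr not in link.attrs:
            continue
        if isImg and tag == "img":
            link["style"] = "max-width: 95%;"    # same oversize-image guard as relink()
            link.attrs.pop("width", None)
            link.attrs.pop("height", None)
print(soup)   # the <img> now carries the style attribute and no width/height
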
Example #10
    def processGdocPage(self, url, content):
        dummy_fName, content = content
        print("Page size: ", len(content))
        soup = bs4.BeautifulSoup(content)
        TextScrape.urlFuncs.canonizeUrls(soup, url)

        pgTitle, soup = self.cleanGdocPage(soup, url)

        plainLinks = self.extractLinks(soup, url)
        self.log.info("Page title = '%s'", pgTitle)
        soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

        url = self.preprocessGdocReaderUrl(url)
        url = gdp.trimGDocUrl(url)
        # Since the content we're extracting will be embedded into another page, we want to
        # strip out the <body> and <html> tags. `unwrap()` replaces the tag it's called on
        # with that tag's contents, so we end up with just the contents of the <body> tag.
        soup.body.unwrap()
        pgBody = soup.prettify()

        # No image links, since they're served as resource files in a google doc
        imageLinks = []
        return plainLinks, imageLinks, pgTitle, pgBody
Example #11
	def processGdocPage(self, url, content):
		dummy_fName, content = content
		print("Page size: ", len(content))
		soup = bs4.BeautifulSoup(content)
		TextScrape.urlFuncs.canonizeUrls(soup, url)

		pgTitle, soup = self.cleanGdocPage(soup, url)

		plainLinks = self.extractLinks(soup, url)
		self.log.info("Page title = '%s'", pgTitle)
		soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

		url = self.preprocessGdocReaderUrl(url)
		url = gdp.trimGDocUrl(url)
		# Since the content we're extracting will be embedded into another page, we want to
		# strip out the <body> and <html> tags. `unwrap()` replaces the tag it's called on
		# with that tag's contents, so we end up with just the contents of the <body> tag.
		soup.body.unwrap()
		pgBody = soup.prettify()

		# No image links, since they're served as resource files in a google doc
		imageLinks = []
		return plainLinks, imageLinks, pgTitle, pgBody
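
Both versions of processGdocPage call soup.body.unwrap() so the extracted fragment can be embedded in another page without its own framing. Below is a minimal demonstration of what unwrap() does to a parsed tree; the <html> wrapper is unwrapped as well purely to show the effect, and the parser name is passed explicitly to keep the sketch self-contained.

import bs4

soup = bs4.BeautifulSoup("<html><body><p>Hello</p></body></html>", "html.parser")
soup.body.unwrap()    # drop the <body> tag, keep its children in place
soup.html.unwrap()    # same for the <html> wrapper, leaving only the inner content
print(soup.prettify())
# <p>
#  Hello
# </p>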