def extractContent(self):
		"""
		Parse ``self.content`` as HTML and return the extracted links, title and body.

		The soup is run through the child-class hooks and the decompose passes,
		and its URLs are canonized against ``self.pageUrl``. Links and images
		are only extracted when ``self.checkDomain(self.pageUrl)`` is truthy;
		otherwise both lists are empty.

		Returns:
			dict with the keys ``'plainLinks'``, ``'rsrcLinks'``, ``'title'``
			and ``'contents'``.
		"""
		self.log.info("Processing '%s' as HTML.", self.pageUrl)
		# NOTE(review): no parser is named, so bs4 chooses one from whatever is
		# installed. Confirm which parser this project expects before pinning it.
		soup = bs4.BeautifulSoup(self.content)

		# Allow child-class hooking
		soup = self.preprocessBody(soup)

		# Clear out any particularly obnoxious content before doing any parsing.
		soup = self.decomposeItems(soup, self._decomposeBefore)

		# Make all the page URLs fully qualified, so they're unambiguous
		soup = urlFuncs.canonizeUrls(soup, self.pageUrl)

		# Conditionally pull out the page content and enqueue it.
		if self.checkDomain(self.pageUrl):
			plainLinks = self.extractLinks(soup, self.pageUrl)
			imageLinks = self.extractImages(soup, self.pageUrl)
		else:
			# `Logger.warn` is a deprecated alias; `warning` is the supported name.
			self.log.warning("Not extracting images or links for url '%s'", self.pageUrl)
			plainLinks = []
			imageLinks = []

		# Do the later cleanup to prep the content for local rendering.
		soup = self.decomposeItems(soup, self._decompose)

		soup = self.decomposeAdditional(soup)

		# Allow child-class hooking
		soup = self.postprocessBody(soup)

		# Process page with readability, extract title.
		pgTitle, pgBody = self.cleanHtmlPage(soup, url=self.pageUrl)
		if not self.ignoreMissingTitle:
			# The 'has no title!' marker is looked for inside the returned title.
			if 'has no title!' in pgTitle:
				self.log.warning("Page has no title: '%s' (len %s)", pgTitle, len(pgBody))
			else:
				self.log.info("Page with title '%s' retreived.", pgTitle)

		# If an item has both a plain-link and an image link, prefer the
		# image link, and delete it from the plain link list. The removal is
		# done in place, one matching entry per image link.
		for link in imageLinks:
			if link in plainLinks:
				plainLinks.remove(link)

		ret = {}
		ret['plainLinks'] = plainLinks
		ret['rsrcLinks']  = imageLinks
		ret['title']      = pgTitle
		ret['contents']   = pgBody

		return ret
	def extractContent(self):
		"""
		Parse ``self.content`` as HTML and return the extracted links, title and body.

		Asserts that ``self.content`` is truthy, then runs the parsed tree
		through the hook, decompose, restyle and CSS passes in order before
		handing it to ``cleanHtmlPage``. Links are extracted unconditionally.

		Returns:
			dict with the keys ``'plainLinks'``, ``'rsrcLinks'``, ``'title'``
			and ``'contents'``.
		"""
		self.log.info("Processing '%s' as HTML (size: %s).", self.pageUrl, len(self.content))
		assert self.content
		tree = WebMirror.util.webFunctions.as_soup(self.content)

		# Child-class hook, run before anything else touches the tree.
		tree = self.preprocessBody(tree)

		# Drop the worst offenders before any parsing is attempted.
		tree = self.decomposeItems(tree, self._decomposeBefore)

		# Fully qualify every URL in the page so they are unambiguous.
		tree = urlFuncs.canonizeUrls(tree, self.pageUrl)

		# Collect links and images; filtering is left to the parent.
		pageLinks = self.extractLinks(tree, self.pageUrl)
		resourceLinks = self.extractImages(tree, self.pageUrl)

		# Second cleanup round, prepping the content for local rendering.
		tree = self.decomposeItems(tree, self._decompose)
		tree = self.decomposeAdditional(tree)
		tree = self.spotPatch(tree)
		tree = self.destyleItems(tree)

		# Child-class hook, run after the built-in cleanup.
		tree = self.postprocessBody(tree)

		tree = self.removeClasses(tree)
		tree = self.fixCss(tree)

		# Run the page through readability and pull out the title.
		title, body = self.cleanHtmlPage(tree, url=self.pageUrl)

		# Anything present as both a plain link and an image link is kept as
		# an image link only; it is removed in place from the plain links.
		for candidate in resourceLinks:
			if candidate not in pageLinks:
				continue
			pageLinks.remove(candidate)

		return {
			'plainLinks': pageLinks,
			'rsrcLinks': resourceLinks,
			'title': title,
			'contents': body,
		}
Beispiel #3
0
	def extractContent(self):
		"""
		Parse ``self.content`` as HTML and return its title and cleaned body.

		Asserts that ``self.content`` is truthy, then runs the soup through
		the hook, decompose, restyle and CSS passes in order before handing it
		to ``cleanHtmlPage``.

		Returns:
			dict with the keys ``'plainLinks'``, ``'rsrcLinks'``, ``'title'``
			and ``'contents'``. Both link entries are always empty lists in
			this variant.
		"""
		self.log.info("Processing '%s' as HTML (size: %s).", self.pageUrl, len(self.content))
		assert self.content
		soup = WebMirror.util.webFunctions.as_soup(self.content)

		# Allow child-class hooking
		soup = self.preprocessBody(soup)

		# Clear out any particularly obnoxious content before doing any parsing.
		soup = self.decomposeItems(soup, self._decomposeBefore)

		# Make all the page URLs fully qualified, so they're unambiguous
		soup = urlFuncs.canonizeUrls(soup, self.pageUrl)

		# The extracted links are never returned, so the old pass that removed
		# image links from the plain-link list was dead work and is gone.
		# NOTE(review): the extraction calls are kept in case they are
		# overridden hooks with side effects. Confirm, then drop them too.
		self.extractLinks(soup, self.pageUrl)
		self.extractImages(soup, self.pageUrl)

		# Do the later cleanup to prep the content for local rendering.
		soup = self.decomposeItems(soup, self._decompose)

		soup = self.decomposeAdditional(soup)
		soup = self.spotPatch(soup)
		soup = self.destyleItems(soup)

		# Allow child-class hooking
		soup = self.postprocessBody(soup)

		soup = self.removeClasses(soup)

		soup = self.fixCss(soup)

		# Process page with readability, extract title.
		pgTitle, pgBody = self.cleanHtmlPage(soup, url=self.pageUrl)

		# NOTE(review): link lists are deliberately reported empty, exactly as
		# the original did. Confirm this is intended for this processor.
		ret = {}
		ret['plainLinks'] = []
		ret['rsrcLinks']  = []
		ret['title']      = pgTitle
		ret['contents']   = pgBody

		return ret
	def processGdocPage(self, url, content):
		"""
		Turn a fetched Google-doc page into links, a title and an embeddable body.

		Args:
			url: URL of the document; used to canonize and extract links.
			content: two-item sequence. The first item is ignored and the
				second is the markup to parse.

		Returns:
			tuple ``(plainLinks, imageLinks, pgTitle, pgBody)``. ``imageLinks``
			is always an empty list and ``pgBody`` is the prettified soup.
		"""
		dummy_fName, content = content
		# NOTE(review): no parser is named, so bs4 chooses one from whatever is
		# installed. Confirm which parser this project expects before pinning it.
		soup = bs4.BeautifulSoup(content)
		# Keep the returned soup rather than relying on in-place mutation; this
		# matches how extractContent uses canonizeUrls.
		soup = urlFuncs.canonizeUrls(soup, url)

		pgTitle, soup = self.cleanGdocPage(soup, url)

		plainLinks = self.extractLinks(soup, url)
		self.log.info("Page title = '%s'", pgTitle)
		soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

		# NOTE(review): the rewritten url is not used below. The calls are kept
		# in case the preprocess hook has side effects; confirm before removing.
		url = self.preprocessGdocReaderUrl(url)
		url = urlFuncs.trimGDocUrl(url)

		# Since the content we're extracting will be embedded into another page,
		# the <body> wrapper is removed. `unwrap()` replaces the tag it is called
		# on with that tag's contents. Only <body> is unwrapped here.
		# Some parsers do not synthesize a <body>, so guard against it being
		# absent instead of failing with AttributeError.
		body = soup.body
		if body is not None:
			body.unwrap()
		pgBody = soup.prettify()

		# No image links, since they're served as resource files in a google doc
		imageLinks = []
		return plainLinks, imageLinks, pgTitle, pgBody
Beispiel #5
0
    def processGdocPage(self, url, content):
        """
        Turn a fetched Google-doc page into links, a title and an embeddable body.

        Args:
            url: URL of the document; used to canonize and extract links.
            content: two-item sequence. The first item is ignored and the
                second is the markup to parse.

        Returns:
            tuple ``(plainLinks, imageLinks, pgTitle, pgBody)``. ``imageLinks``
            is always an empty list and ``pgBody`` is the prettified soup.
        """
        dummy_fName, content = content
        soup = WebMirror.util.webFunctions.as_soup(content)
        # Keep the returned soup rather than relying on in-place mutation; this
        # matches how extractContent uses canonizeUrls.
        soup = urlFuncs.canonizeUrls(soup, url)

        pgTitle, soup = self.cleanGdocPage(soup, url)

        plainLinks = self.extractLinks(soup, url)
        self.log.info("Page title = '%s'", pgTitle)
        soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

        # NOTE(review): the rewritten url is not used below. The calls are kept
        # in case the preprocess hook has side effects; confirm before removing.
        url = self.preprocessGdocReaderUrl(url)
        url = urlFuncs.trimGDocUrl(url)

        # Since the content we're extracting will be embedded into another page,
        # the <body> wrapper is removed. `unwrap()` replaces the tag it is called
        # on with that tag's contents. Only <body> is unwrapped here.
        # Guard against a tree without <body> instead of failing with
        # AttributeError.
        body = soup.body
        if body is not None:
            body.unwrap()
        pgBody = soup.prettify()

        # No image links, since they're served as resource files in a google doc
        imageLinks = []
        return plainLinks, imageLinks, pgTitle, pgBody