def extractContent(self):
    """Parse ``self.content`` as HTML and return its title, body and links.

    The soup passes through the ``preprocessBody`` hook, an early decompose
    pass (``self._decomposeBefore``) and URL canonization before any links
    are collected.  The later decompose passes and the ``postprocessBody``
    hook then run before ``cleanHtmlPage`` produces the title and body.

    Links and images are only extracted when ``self.checkDomain`` accepts
    ``self.pageUrl``; otherwise both link lists are empty.

    Returns:
        dict: keys ``'plainLinks'``, ``'rsrcLinks'``, ``'title'`` and
        ``'contents'``.
    """
    self.log.info("Processing '%s' as HTML.", self.pageUrl)

    # NOTE(review): no parser is named, so bs4 chooses one from whatever is
    # installed -- consider pinning it so results match across machines.
    soup = bs4.BeautifulSoup(self.content)

    # Allow child-class hooking
    soup = self.preprocessBody(soup)

    # Clear out any particularly obnoxious content before doing any parsing.
    soup = self.decomposeItems(soup, self._decomposeBefore)

    # Make all the page URLs fully qualified, so they're unambiguous
    soup = urlFuncs.canonizeUrls(soup, self.pageUrl)

    # Conditionally pull out the page content and enqueue it.
    if self.checkDomain(self.pageUrl):
        plainLinks = self.extractLinks(soup, self.pageUrl)
        imageLinks = self.extractImages(soup, self.pageUrl)
    else:
        # Logger.warn is a deprecated alias; use Logger.warning.
        self.log.warning("Not extracting images or links for url '%s'", self.pageUrl)
        plainLinks = []
        imageLinks = []

    # Do the later cleanup to prep the content for local rendering.
    soup = self.decomposeItems(soup, self._decompose)
    soup = self.decomposeAdditional(soup)

    # Allow child-class hooking
    soup = self.postprocessBody(soup)

    # Process page with readability, extract title.
    pgTitle, pgBody = self.cleanHtmlPage(soup, url=self.pageUrl)

    if not self.ignoreMissingTitle:
        if 'has no title!' in pgTitle:
            self.log.warning("Page has no title: '%s' (len %s)", pgTitle, len(pgBody))
        else:
            self.log.info("Page with title '%s' retreived.", pgTitle)

    # If an item has both a plain-link and an image link, prefer the
    # image link, and delete it from the plain link list.  list.remove()
    # drops only the first match, so duplicate plain links beyond the
    # first are left in place.
    for link in imageLinks:
        if link in plainLinks:
            plainLinks.remove(link)

    ret = {}
    ret['plainLinks'] = plainLinks
    ret['rsrcLinks'] = imageLinks
    ret['title'] = pgTitle
    ret['contents'] = pgBody
    return ret
def extractContent(self):
    """Run the HTML clean-up pipeline over ``self.content``.

    Links and images are harvested right after URL canonization, before
    the rendering-oriented clean-up passes run.  No domain filtering is
    done here (the original comment says the parent handles it).

    Returns:
        dict: ``'plainLinks'``, ``'rsrcLinks'``, ``'title'``, ``'contents'``.

    Raises:
        AssertionError: when ``self.content`` is falsy.
    """
    self.log.info("Processing '%s' as HTML (size: %s).", self.pageUrl, len(self.content))
    assert self.content

    tree = WebMirror.util.webFunctions.as_soup(self.content)

    # Subclass hook, then strip the worst offenders before anything is parsed.
    tree = self.preprocessBody(tree)
    tree = self.decomposeItems(tree, self._decomposeBefore)

    # Fully-qualify every URL so the extracted links are unambiguous.
    tree = urlFuncs.canonizeUrls(tree, self.pageUrl)

    # Harvest links now; filtering is left to the caller.
    page_links = self.extractLinks(tree, self.pageUrl)
    resource_links = self.extractImages(tree, self.pageUrl)

    # Later clean-up that preps the markup for local rendering.
    tree = self.decomposeItems(tree, self._decompose)
    tree = self.decomposeAdditional(tree)
    tree = self.spotPatch(tree)
    tree = self.destyleItems(tree)

    # Subclass hook, followed by class/CSS normalisation.
    tree = self.postprocessBody(tree)
    tree = self.removeClasses(tree)
    tree = self.fixCss(tree)

    # Readability pass: yields the title and the cleaned body.
    title, body = self.cleanHtmlPage(tree, url=self.pageUrl)

    # A URL present in both lists is kept only as a resource link; the
    # first matching entry is dropped from the plain links.
    for resource in resource_links:
        try:
            page_links.remove(resource)
        except ValueError:
            pass

    return {
        'plainLinks': page_links,
        'rsrcLinks': resource_links,
        'title': title,
        'contents': body,
    }
def extractContent(self):
    """Run the HTML clean-up pipeline and return only title and body.

    Links and images are still extracted and de-duplicated, but the
    returned ``'plainLinks'`` and ``'rsrcLinks'`` entries are always
    empty lists.

    NOTE(review): the extracted links are discarded -- confirm this is
    intentional for this processor rather than a leftover.

    Returns:
        dict: ``'plainLinks'`` (``[]``), ``'rsrcLinks'`` (``[]``),
        ``'title'`` and ``'contents'``.

    Raises:
        AssertionError: when ``self.content`` is falsy.
    """
    self.log.info("Processing '%s' as HTML (size: %s).", self.pageUrl, len(self.content))
    assert self.content

    doc = WebMirror.util.webFunctions.as_soup(self.content)

    # Subclass hook first, then remove obnoxious content before parsing.
    doc = self.preprocessBody(doc)
    doc = self.decomposeItems(doc, self._decomposeBefore)

    # Make every URL on the page fully qualified.
    doc = urlFuncs.canonizeUrls(doc, self.pageUrl)

    # Extraction is kept in case the helpers have side effects; the
    # results themselves are not returned.
    found_links = self.extractLinks(doc, self.pageUrl)
    found_images = self.extractImages(doc, self.pageUrl)

    # Clean-up passes that prepare the markup for local rendering.
    doc = self.decomposeItems(doc, self._decompose)
    doc = self.decomposeAdditional(doc)
    doc = self.spotPatch(doc)
    doc = self.destyleItems(doc)

    # Subclass hook, then class and CSS fix-ups.
    doc = self.postprocessBody(doc)
    doc = self.removeClasses(doc)
    doc = self.fixCss(doc)

    # Readability pass producing the title and body.
    page_title, page_body = self.cleanHtmlPage(doc, url=self.pageUrl)

    # Prefer the image link when a URL is in both lists (first match only).
    for image in found_images:
        try:
            found_links.remove(image)
        except ValueError:
            pass

    return {
        'plainLinks': [],
        'rsrcLinks': [],
        'title': page_title,
        'contents': page_body,
    }
def processGdocPage(self, url, content):
    """Convert a fetched Google-Docs page into embeddable markup.

    Args:
        url: Address of the document; used to canonize links and passed
            to the cleaning and link-extraction helpers.
        content: Two-item sequence.  The first item (bound to
            ``dummy_fName``) is ignored; the second is the markup to parse.

    Returns:
        tuple: ``(plainLinks, imageLinks, pgTitle, pgBody)``.  ``imageLinks``
        is always an empty list and ``pgBody`` is the prettified markup.
    """
    dummy_fName, content = content

    # NOTE(review): no parser is named, so bs4 chooses one from whatever is
    # installed -- consider pinning it.
    soup = bs4.BeautifulSoup(content)

    # Use the returned soup instead of relying on in-place mutation only.
    soup = urlFuncs.canonizeUrls(soup, url)

    pgTitle, soup = self.cleanGdocPage(soup, url)
    plainLinks = self.extractLinks(soup, url)
    self.log.info("Page title = '%s'", pgTitle)

    soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

    # NOTE(review): the normalized URL is not read again in this method;
    # the calls are kept in case the helpers have side effects.
    url = self.preprocessGdocReaderUrl(url)
    url = urlFuncs.trimGDocUrl(url)

    # The result is embedded into another page, so the <body> wrapper is
    # removed: unwrap() replaces the tag with its own children.  Documents
    # parsed without a <body> tag are left untouched instead of raising
    # AttributeError on None.
    body = soup.body
    if body is not None:
        body.unwrap()

    pgBody = soup.prettify()

    # No image links, since they're served as resource files in a google doc
    imageLinks = []
    return plainLinks, imageLinks, pgTitle, pgBody
def processGdocPage(self, url, content):
    """Convert a fetched Google-Docs page into embeddable markup.

    Args:
        url: Address of the document; used to canonize links and passed
            to the cleaning and link-extraction helpers.
        content: Two-item sequence.  The first item (bound to
            ``dummy_fName``) is ignored; the second is the markup to parse.

    Returns:
        tuple: ``(plainLinks, imageLinks, pgTitle, pgBody)``.  ``imageLinks``
        is always an empty list and ``pgBody`` is the prettified markup.
    """
    dummy_fName, content = content
    soup = WebMirror.util.webFunctions.as_soup(content)

    # Use the returned soup instead of relying on in-place mutation only.
    soup = urlFuncs.canonizeUrls(soup, url)

    pgTitle, soup = self.cleanGdocPage(soup, url)
    plainLinks = self.extractLinks(soup, url)
    self.log.info("Page title = '%s'", pgTitle)

    soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

    # NOTE(review): the normalized URL is not read again in this method;
    # the calls are kept in case the helpers have side effects.
    url = self.preprocessGdocReaderUrl(url)
    url = urlFuncs.trimGDocUrl(url)

    # The result is embedded into another page, so the <body> wrapper is
    # removed: unwrap() replaces the tag with its own children.  Documents
    # parsed without a <body> tag are left untouched instead of raising
    # AttributeError on None.
    body = soup.body
    if body is not None:
        body.unwrap()

    pgBody = soup.prettify()

    # No image links, since they're served as resource files in a google doc
    imageLinks = []
    return plainLinks, imageLinks, pgTitle, pgBody