def processGdocPage(self, url, content):
    dummy_fName, content = content
    soup = WebRequest.as_soup(content)
    urlFuncs.canonizeUrls(soup, url)

    pgTitle, soup = self.cleanGdocPage(soup, url)

    plainLinks = self.extractLinks(soup, url)
    self.log.info("Page title = '%s'", pgTitle)

    soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

    url = self.preprocessGdocReaderUrl(url)
    url = urlFuncs.trimGDocUrl(url)

    # Since the content we're extracting will be embedded into another page, we want to
    # strip out the <body> and <html> tags. `unwrap()` replaces the soup with the contents of the
    # tag it's called on. We end up with just the contents of the <body> tag.
    soup.body.unwrap()
    pgBody = soup.prettify()

    # No image links, since they're served as resource files in a google doc
    imageLinks = []

    return plainLinks, imageLinks, pgTitle, pgBody
def processGdocPage(self, url, content):
    dummy_fName, content = content
    soup = common.util.webFunctions.as_soup(content)
    urlFuncs.canonizeUrls(soup, url)

    pgTitle, soup = self.cleanGdocPage(soup, url)

    plainLinks = self.extractLinks(soup, url)
    self.log.info("Page title = '%s'", pgTitle)

    soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

    url = self.preprocessGdocReaderUrl(url)
    url = urlFuncs.trimGDocUrl(url)

    # Since the content we're extracting will be embedded into another page, we want to
    # strip out the <body> and <html> tags. `unwrap()` replaces the soup with the contents of the
    # tag it's called on. We end up with just the contents of the <body> tag.
    soup.body.unwrap()
    pgBody = soup.prettify()

    # No image links, since they're served as resource files in a google doc
    imageLinks = []

    return plainLinks, imageLinks, pgTitle, pgBody
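# A tiny, standalone illustration of the `body.unwrap()` step used above: unwrap()
# removes the tag itself but leaves its children in place in the tree. The sample
# markup below is made up for the example; it only assumes bs4 and lxml are available.
import bs4

demo = bs4.BeautifulSoup("<html><body><p>Embedded content</p></body></html>", "lxml")
demo.body.unwrap()        # the <p> now sits directly under <html>, with no <body> wrapper
print(demo.prettify())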
def exposed_retrigger_feed_urls():
    '''
    Retrigger the content urls from each feed item.
    '''

    # RssFeedPost attributes:
    #   id
    #   type
    #   feed_id
    #   contenturl
    #   contentid
    #   title
    #   contents
    #   updated
    #   published
    #   tag_rel
    #   author_rel
    #   tags
    #   author

    urls = set()
    with db.session_context() as sess:
        processor = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath="Main.RssDb",
            pageUrl='http://www.example.org',
            pgContent='',
            type='application/atom+xml',
            transfer=False,
            debug_print=True,
            db_sess=sess,
            write_debug=False)

        print("Loading posts....")
        items = sess.query(db.RssFeedPost).all()
        print("Loaded %s rows" % len(items))

        have_content = [tmp for tmp in items if tmp.contents]
        print("%s rows have content" % len(have_content))

        pbar = tqdm.tqdm(items, desc="Retriggering RSS URLs")
        for post in pbar:
            if post.contenturl.startswith("tag:blogger.com"):
                continue
            if post.contenturl and '#comment_' not in post.contenturl:
                urls.add(post.contenturl)

            if post.contents and post.contents != 'Disabled?' and post.contents != 'wat':
                soup = WebRequest.as_soup(post.contents)
                # print(post.contents)

                # Make all the page URLs fully qualified, so they're unambiguous
                soup = urlFuncs.canonizeUrls(soup, post.contenturl)

                # pull out the page content and enqueue it. Filtering is
                # done in the parent.
                plainLinks = processor.extractLinks(soup, post.contenturl)
                imageLinks = processor.extractImages(soup, post.contenturl)

                # if plainLinks or imageLinks:
                #     print((len(plainLinks), len(imageLinks)))

                urls.update(plainLinks)
                urls.update(imageLinks)
                # pbar.set_description("Links: %s" % len(urls))

    urls = list(urls)
    urld = {}
    for url in [tmp for tmp in urls if tmp]:
        nl = urllib.parse.urlsplit(url).netloc
        if nl:
            urld.setdefault(nl, [])
            urld[nl].append(url)

    print("Extracted %s unique links for %s netlocs" % (len(urls), len(urld)))

    # rules = WebMirror.rules.load_rules()
    # feeds = [item['feedurls'] for item in rules]
    # feeds = [item for sublist in feeds for item in sublist]
    # url = feeds[0]
    # parsed = urllib.parse.urlparse(url)
    # root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
    # print("Using feed url %s for job base" % url)

    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            for key, urls in tqdm.tqdm(urld.items(), desc='Source Netlocs'):
                sel_url = urls[0]
                parsed = urllib.parse.urlparse(sel_url)
                root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))

                job = db.WebPages(
                    url=sel_url,
                    starturl=root,
                    netloc=key,
                    distance=0,
                    is_text=True,
                    priority=db.DB_LOW_PRIORITY,
                    type='unknown',
                    fetchtime=datetime.datetime.now(),
                )

                for chunk in chunks(urls, 500):
                    archiver.upsertResponseLinks(job, plain=chunk, resource=[], debug=True, interactive=True)

    except Exception:
        traceback.print_exc()
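# The chunks() helper used in exposed_retrigger_feed_urls() is not defined in this
# snippet. A minimal sketch of the assumed behaviour (yield successive fixed-size
# slices of a list) would be:
def chunks(lst, n):
    # Yield successive n-sized slices from lst.
    for i in range(0, len(lst), n):
        yield lst[i:i + n]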
def extractContent(self):
    feed = self.parseFeed(self.content)

    try:
        data = self.processFeed(feed, self.pageUrl)
    except Exception as e:
        self.log.critical("Failure parsing RSS feed!")
        for line in traceback.format_exc().split("\n"):
            self.log.critical(line)
        raise e

    plainLinks = []
    rsrcLinks = []

    if 'entries' in feed:
        for post in feed['entries']:
            if hasattr(post, 'contenturl') and post.contenturl.startswith("tag:blogger.com"):
                continue
            if hasattr(post, 'contenturl') and post.contenturl and '#comment_' not in post.contenturl:
                plainLinks.append(post.contenturl)

            if hasattr(post, 'contents') and post.contents and post.contents != 'Disabled?' and post.contents != 'wat':
                soup = WebRequest.as_soup(post.contents)
                # print(post.contents)

                # Make all the page URLs fully qualified, so they're unambiguous
                soup = urlFuncs.canonizeUrls(soup, post.contenturl)

                # pull out the page content and enqueue it. Filtering is
                # done in the parent.
                plainLinks.extend(self.extractLinks(soup, post.contenturl))
                rsrcLinks.extend(self.extractImages(soup, post.contenturl))

            if 'links' in post:
                for link in post['links']:
                    if 'href' in link:
                        plainLinks.append(link['href'])

            if 'link' in post:
                plainLinks.append(post['link'])

    # I can't for the life of me remember why I added this.
    # self.normal_priority_links_trigger(plainLinks + rsrcLinks)

    output = bs4.BeautifulSoup("<html><body></body></html>", "lxml")
    output.html.body.append(output.new_tag("h3", text="RSS Feed for url '%s'" % self.pageUrl))

    for feed_item in data:
        itemdiv = output.new_tag("div")

        temp = output.new_tag("h5")
        temp.string = feed_item['title']
        itemdiv.append(temp)

        temp = output.new_tag("a", href=feed_item['linkUrl'])
        temp.string = feed_item['linkUrl']
        itemdiv.append(temp)

        temp = output.new_tag("p")
        temp.string = ", ".join([str(author) for author in feed_item['authors']])
        itemdiv.append(temp)

        temp = output.new_tag("p")
        temp.string = feed_item['contents']
        itemdiv.append(temp)

        output.html.body.append(itemdiv)

    ret = {}
    ret['title']       = "RSS Feed for url '%s'" % self.pageUrl
    ret['contents']    = output.html.body.prettify()
    ret['mimeType']    = "text/html"
    ret['rss-content'] = data
    ret['plainLinks']  = plainLinks
    ret['rsrcLinks']   = rsrcLinks

    return ret
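# For reference, the rendering loop in the RSS extractContent() above only touches the
# keys shown here on each item returned by processFeed(). The values are illustrative
# placeholders, not real feed data.
example_feed_item = {
    'title'    : "Example post title",
    'linkUrl'  : "http://www.example.org/example-post",
    'authors'  : ["Example Author"],
    'contents' : "<p>Post body as HTML.</p>",
}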
def extractContent(self):
    self.log.info("Processing '%s' as HTML (size: %s).", self.pageUrl, len(self.content))
    assert self.content
    # print(type(self.content))

    badxmlprefix = '<?xml version="1.0"?>'
    if self.content.strip().lower().startswith(badxmlprefix):
        self.content = self.content[len(badxmlprefix):]

    self.checkSquatters(self.content)

    soup = WebRequest.as_soup(self.content)

    # try:
    #     soup = WebRequest.as_soup(self.content)
    # except AttributeError as e:
    #     with open("badpage %s.html" % time.time(), "w") as fp:
    #         fp.write(self.content)
    #     raise e

    soup = self.prePatch(self.pageUrl, soup)

    # Allow child-class hooking
    soup = self.preprocessBody(soup)

    # Clear out any particularly obnoxious content before doing any parsing.
    soup = self.decomposeItems(soup, self._decomposeBefore)

    # Make all the page URLs fully qualified, so they're unambiguous
    soup = urlFuncs.canonizeUrls(soup, self.pageUrl)

    # pull out the page content and enqueue it. Filtering is
    # done in the parent.
    plainLinks = self.extractLinks(soup, self.pageUrl)
    imageLinks = self.extractImages(soup, self.pageUrl)

    # Do the later cleanup to prep the content for local rendering.
    soup = self.decomposeItems(soup, self._decompose)
    soup = self.decomposeAdditional(soup)
    soup = self.spotPatch(soup)
    soup = self.destyleItems(soup)

    # Allow child-class hooking
    soup = self.postprocessBody(soup)

    soup = self.removeClasses(soup)
    soup = self.purgeEmptyTags(soup)
    soup = self.fixCss(soup)

    # Process page with readability, extract title.
    pgTitle, pgBody = self.cleanHtmlPage(soup, url=self.pageUrl)

    ret = {}

    # If an item has both a plain-link and an image link, prefer the
    # image link, and delete it from the plain link list
    for link in imageLinks:
        if link in plainLinks:
            plainLinks.remove(link)

    ret['plainLinks'] = plainLinks
    ret['rsrcLinks']  = imageLinks
    ret['title']      = pgTitle
    ret['contents']   = pgBody

    return ret
def extractContent(self):
    self.log.info("Processing '%s' as HTML (size: %s).", self.pageUrl, len(self.content))
    assert self.content
    # print(type(self.content))

    badxmlprefix = '<?xml version="1.0"?>'
    if self.content.strip().lower().startswith(badxmlprefix):
        self.content = self.content[len(badxmlprefix):]

    soup = common.util.webFunctions.as_soup(self.content)

    # try:
    #     soup = common.util.webFunctions.as_soup(self.content)
    # except AttributeError as e:
    #     with open("badpage %s.html" % time.time(), "w") as fp:
    #         fp.write(self.content)
    #     raise e

    # Allow child-class hooking
    soup = self.preprocessBody(soup)

    # Clear out any particularly obnoxious content before doing any parsing.
    soup = self.decomposeItems(soup, self._decomposeBefore)

    # Make all the page URLs fully qualified, so they're unambiguous
    soup = urlFuncs.canonizeUrls(soup, self.pageUrl)

    # pull out the page content and enqueue it. Filtering is
    # done in the parent.
    plainLinks = self.extractLinks(soup, self.pageUrl)
    imageLinks = self.extractImages(soup, self.pageUrl)

    # Do the later cleanup to prep the content for local rendering.
    soup = self.decomposeItems(soup, self._decompose)
    soup = self.decomposeAdditional(soup)
    soup = self.spotPatch(soup)
    soup = self.destyleItems(soup)

    # Allow child-class hooking
    soup = self.postprocessBody(soup)

    soup = self.removeClasses(soup)
    soup = self.fixCss(soup)

    # Process page with readability, extract title.
    pgTitle, pgBody = self.cleanHtmlPage(soup, url=self.pageUrl)

    ret = {}

    # If an item has both a plain-link and an image link, prefer the
    # image link, and delete it from the plain link list
    for link in imageLinks:
        if link in plainLinks:
            plainLinks.remove(link)

    ret['plainLinks'] = plainLinks
    ret['rsrcLinks']  = imageLinks
    ret['title']      = pgTitle
    ret['contents']   = pgBody

    return ret