Example 1
    def processGdocPage(self, url, content):
        dummy_fName, content = content
        soup = WebRequest.as_soup(content)
        urlFuncs.canonizeUrls(soup, url)

        pgTitle, soup = self.cleanGdocPage(soup, url)

        plainLinks = self.extractLinks(soup, url)
        self.log.info("Page title = '%s'", pgTitle)
        soup = self.relink(soup, imRelink=self.convertToGdocReaderImage)

        url = self.preprocessGdocReaderUrl(url)
        url = urlFuncs.trimGDocUrl(url)
        # Since the content we're extracting will be embedded into another page, we want to
        # strip out the <body> and <html> tags. `unwrap()` replaces the tag it's called on
        # with that tag's contents, so we end up with just the contents of the <body> tag.
        soup.body.unwrap()
        pgBody = soup.prettify()

        # No image links, since they're served as resource files in a google doc
        imageLinks = []
        return plainLinks, imageLinks, pgTitle, pgBody
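
The `unwrap()` behaviour described in the comment above can be checked in isolation. A minimal standalone sketch (plain BeautifulSoup, independent of the scraper classes):

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>Hello</p>", "lxml")   # lxml wraps this in <html><body>...</body></html>
    soup.body.unwrap()                             # replace <body> with its children
    soup.html.unwrap()                             # replace <html> with its children
    print(soup)                                    # -> <p>Hello</p>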
Example 3
def exposed_retrigger_feed_urls():
    '''
    Retrigger the content urls from each feed item.
    '''

    # RssFeedPost attributes:
    # 	id
    # 	type
    # 	feed_id
    # 	contenturl
    # 	contentid
    # 	title
    # 	contents
    # 	updated
    # 	published
    # 	tag_rel
    # 	author_rel
    # 	tags
    # 	author

    urls = set()
    with db.session_context() as sess:
        processor = WebMirror.processor.RssProcessor.RssProcessor(
            loggerPath="Main.RssDb",
            pageUrl='http://www.example.org',
            pgContent='',
            type='application/atom+xml',
            transfer=False,
            debug_print=True,
            db_sess=sess,
            write_debug=False)

        print("Loading posts....")
        items = sess.query(db.RssFeedPost).all()
        print("Loaded %s rows" % len(items))
        have_content = [tmp for tmp in items if tmp.contents]
        print("%s rows have content" % len(have_content))

        pbar = tqdm.tqdm(items, desc="Retriggering RSS URLs")
        for post in pbar:
            if post.contenturl.startswith("tag:blogger.com"):
                continue

            if post.contenturl and '#comment_' not in post.contenturl:
                urls.add(post.contenturl)

            if post.contents and post.contents != 'Disabled?' and post.contents != 'wat':
                soup = WebRequest.as_soup(post.contents)
                # print(post.contents)
                # Make all the page URLs fully qualified, so they're unambiguous
                soup = urlFuncs.canonizeUrls(soup, post.contenturl)

                # pull out the page content and enqueue it. Filtering is
                # done in the parent.
                plainLinks = processor.extractLinks(soup, post.contenturl)
                imageLinks = processor.extractImages(soup, post.contenturl)

                # if plainLinks or imageLinks:
                # 	print((len(plainLinks), len(imageLinks)))

                urls.update(plainLinks)
                urls.update(imageLinks)
            # pbar.set_description("Links: %s" % len(urls))

    urls = list(urls)

    urld = {}
    for url in [tmp for tmp in urls if tmp]:
        nl = urllib.parse.urlsplit(url).netloc
        if nl:
            urld.setdefault(nl, []).append(url)

    print("Extracted %s unique links for %s netlocs" % (len(urls), len(urld)))

    # rules = WebMirror.rules.load_rules()
    # feeds = [item['feedurls'] for item in rules]
    # feeds = [item for sublist in feeds for item in sublist]
    # url = feeds[0]
    # parsed = urllib.parse.urlparse(url)
    # root = urllib.parse.urlunparse((parsed[0], parsed[1], "", "", "", ""))
    # print("Using feed url %s for job base" % url)

    try:
        with db.session_context() as sess:
            archiver = SiteArchiver(None, sess, None)
            for key, netloc_urls in tqdm.tqdm(urld.items(), desc='Source Netlocs'):
                sel_url = netloc_urls[0]
                parsed = urllib.parse.urlparse(sel_url)
                root = urllib.parse.urlunparse(
                    (parsed[0], parsed[1], "", "", "", ""))

                job = db.WebPages(
                    url=sel_url,
                    starturl=root,
                    netloc=key,
                    distance=0,
                    is_text=True,
                    priority=db.DB_LOW_PRIORITY,
                    type='unknown',
                    fetchtime=datetime.datetime.now(),
                )
                for chunk in chunks(netloc_urls, 500):
                    archiver.upsertResponseLinks(job,
                                                 plain=chunk,
                                                 resource=[],
                                                 debug=True,
                                                 interactive=True)

    except Exception:
        traceback.print_exc()
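
The batching at the end relies on a chunks() helper whose definition is not included in this example. A minimal sketch of what it presumably looks like (an assumption; the project may define it differently):

    def chunks(seq, n):
        # Yield successive n-sized slices of a list.
        for i in range(0, len(seq), n):
            yield seq[i:i + n]

With something like that in place, chunks(netloc_urls, 500) feeds upsertResponseLinks() at most 500 URLs per call.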
Example 4
    def extractContent(self):

        feed = self.parseFeed(self.content)

        try:
            data = self.processFeed(feed, self.pageUrl)
        except Exception as e:
            self.log.critical("Failure parsing RSS feed!")
            for line in traceback.format_exc().split("\n"):
                self.log.critical(line)
            raise e

        plainLinks = []
        rsrcLinks = []

        if 'entries' in feed:
            for post in feed['entries']:

                if hasattr(post, 'contenturl') and post.contenturl.startswith("tag:blogger.com"):
                    continue

                if hasattr(post, 'contenturl') and post.contenturl and '#comment_' not in post.contenturl:
                    plainLinks.append(post.contenturl)

                if hasattr(post, 'contents') and post.contents and post.contents not in ('Disabled?', 'wat'):
                    soup = WebRequest.as_soup(post.contents)
                    # print(post.contents)
                    # Make all the page URLs fully qualified, so they're unambiguous
                    soup = urlFuncs.canonizeUrls(soup, post.contenturl)

                    # pull out the page content and enqueue it. Filtering is
                    # done in the parent.
                    plainLinks.extend(self.extractLinks(soup, post.contenturl))
                    rsrcLinks.extend(self.extractImages(soup, post.contenturl))
                if 'links' in post:
                    for link in post['links']:
                        if 'href' in link:
                            plainLinks.append(link['href'])
                if 'link' in post:
                    plainLinks.append(post['link'])

        # I can't for the life of me remember why I added this.
        # self.normal_priority_links_trigger(plainLinks + rsrcLinks)

        output = bs4.BeautifulSoup("<html><body></body></html>", "lxml")

        # Keyword arguments to new_tag() become tag attributes; the heading text has to
        # be assigned through .string.
        header = output.new_tag("h3")
        header.string = "RSS Feed for url '%s'" % self.pageUrl
        output.html.body.append(header)

        for feed_item in data:
            itemdiv = output.new_tag("div")

            temp = output.new_tag("h5")
            temp.string = feed_item['title']
            itemdiv.append(temp)

            temp = output.new_tag("a", href=feed_item['linkUrl'])
            temp.string = feed_item['linkUrl']
            itemdiv.append(temp)

            temp = output.new_tag("p")
            temp.string = ", ".join([str(author) for author in feed_item['authors']])
            itemdiv.append(temp)

            temp = output.new_tag("p")
            temp.string = feed_item['contents']
            itemdiv.append(temp)

            output.html.body.append(itemdiv)

        ret = {}

        ret['title'] = "RSS Feed for url '%s'" % self.pageUrl
        ret['contents'] = output.html.body.prettify()
        ret['mimeType'] = "text/html"

        ret['rss-content'] = data
        ret['plainLinks'] = plainLinks
        ret['rsrcLinks'] = rsrcLinks

        return ret
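
A note on the BeautifulSoup tag building used above: keyword arguments passed to new_tag() become HTML attributes, while the visible text of a tag is set afterwards through its .string property. A tiny standalone check:

    import bs4

    output = bs4.BeautifulSoup("<html><body></body></html>", "lxml")
    link = output.new_tag("a", href="http://www.example.org/item")
    link.string = "http://www.example.org/item"
    output.body.append(link)
    print(output.body)  # -> <body><a href="http://www.example.org/item">http://www.example.org/item</a></body>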
Example 5
    def extractContent(self):
        self.log.info("Processing '%s' as HTML (size: %s).", self.pageUrl,
                      len(self.content))
        assert self.content
        # print(type(self.content))

        badxmlprefix = '<?xml version="1.0"?>'

        # Tolerate leading whitespace before the XML declaration; slice the stripped
        # copy so the prefix is removed cleanly.
        stripped = self.content.lstrip()
        if stripped.lower().startswith(badxmlprefix):
            self.content = stripped[len(badxmlprefix):]

        self.checkSquatters(self.content)

        soup = WebRequest.as_soup(self.content)
        # try:
        # 	soup = WebRequest.as_soup(self.content)
        # except AttributeError as e:
        # 	with open("badpage %s.html" % time.time(), "w") as fp:
        # 		fp.write(self.content)
        # 		raise e

        soup = self.prePatch(self.pageUrl, soup)

        # Allow child-class hooking
        soup = self.preprocessBody(soup)

        # Clear out any particularly obnoxious content before doing any parsing.
        soup = self.decomposeItems(soup, self._decomposeBefore)

        # Make all the page URLs fully qualified, so they're unambiguous
        soup = urlFuncs.canonizeUrls(soup, self.pageUrl)

        # pull out the page content and enqueue it. Filtering is
        # done in the parent.
        plainLinks = self.extractLinks(soup, self.pageUrl)
        imageLinks = self.extractImages(soup, self.pageUrl)

        # Do the later cleanup to prep the content for local rendering.
        soup = self.decomposeItems(soup, self._decompose)

        soup = self.decomposeAdditional(soup)
        soup = self.spotPatch(soup)
        soup = self.destyleItems(soup)

        # Allow child-class hooking
        soup = self.postprocessBody(soup)

        soup = self.removeClasses(soup)

        soup = self.purgeEmptyTags(soup)

        soup = self.fixCss(soup)

        # Process page with readability, extract title.
        pgTitle, pgBody = self.cleanHtmlPage(soup, url=self.pageUrl)

        ret = {}

        # If an item has both a plain-link and an image link, prefer the
        # image link, and delete it from the plain link list
        for link in imageLinks:
            if link in plainLinks:
                plainLinks.remove(link)

        ret['plainLinks'] = plainLinks
        ret['rsrcLinks'] = imageLinks
        ret['title'] = pgTitle
        ret['contents'] = pgBody

        return ret
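
Several of the steps above (urlFuncs.canonizeUrls, decomposeItems) are project helpers whose implementations aren't shown in these examples. Conceptually they boil down to standard BeautifulSoup operations; a rough standalone sketch of the two ideas (an illustration only, not the project's actual code):

    from urllib.parse import urljoin
    from bs4 import BeautifulSoup

    page_url = "http://www.example.org/reader/page1.html"
    html = '<div><a href="../chapter/2">next</a><script>track()</script></div>'
    soup = BeautifulSoup(html, "lxml")

    # Make every link fully qualified, so later processing is unambiguous.
    for tag in soup.find_all("a", href=True):
        tag["href"] = urljoin(page_url, tag["href"])

    # Drop obviously unwanted nodes before rendering/storage.
    for tag in soup.find_all(["script", "style"]):
        tag.decompose()

    print(soup.a["href"])  # -> http://www.example.org/chapter/2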