Example #1
def extract(page):
    url = page.content_url
    content = Content(url=url, source=NATIVE)
    logging.info("fetching %r with native extractor" % (url,))
    body = page.raw_content
    try:
        soup = page_parser.parse(body, base_href=page.base_href, notify=logging.info)
        content.body = page_parser.get_body(soup)
        content.title = page_parser.get_title(soup)
    except StandardError as e:
        raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
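The extractor above only makes sense against some Content datastore model. Below is a minimal sketch of what that model might look like, assuming App Engine's google.appengine.ext.db and using only the attributes visible in these examples; the NATIVE and VIEWTEXT values and the property types are assumptions, and the real pagefeed model may differ.

# Hedged sketch of the Content model these examples appear to assume.
from google.appengine.ext import db

NATIVE = 'native'      # assumed identifier for the built-in extractor
VIEWTEXT = 'viewtext'  # assumed identifier for the viewtext.org extractor

class Content(db.Model):
    url = db.StringProperty(required=True)
    source = db.StringProperty()
    title = db.StringProperty()
    body = db.TextProperty()
    lastmod = db.DateTimeProperty(auto_now_add=True)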
Example #2
	def test_content_should_be_purged_if_it_is_older_than_1_day(self):
		old_content = Content(url=some_url, body='old body', title='old title', lastmod=datetime.utcnow() - timedelta(days=1, minutes=1))
		new_content = Content(url=some_url, body='new body', title='new title')
		old_content.put()
		new_content.put()
		Content.purge()
		kept_content = list(Content.all())
		self.assertEqual(kept_content, [new_content])
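The test above implies that Content.purge() deletes entries whose lastmod is more than one day old. A hedged sketch of such a classmethod, extending the fields sketched after Example #1; this is inferred from the test, not taken from the actual pagefeed implementation.

from datetime import datetime, timedelta
from google.appengine.ext import db

class Content(db.Model):
    lastmod = db.DateTimeProperty(auto_now_add=True)

    @classmethod
    def purge(cls):
        # Delete every entry whose lastmod is more than one day old; db.delete
        # accepts the query directly, just as Example #5 does with Page.all().
        cutoff = datetime.utcnow() - timedelta(days=1)
        db.delete(cls.all().filter('lastmod <', cutoff))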
Example #3
def extract(page):
	url = page.content_url
	content = Content(url=url, source=VIEWTEXT)

	viewtext_url = "http://viewtext.org/api/text?url=%(url)s&format=json&rl=false" % {'url': urllib.quote(url)}
	logging.debug("fetching: %s with viewtext extractor" % (viewtext_url,))
	response = fetch(viewtext_url, allow_truncated=False, deadline=20)
	if response.status_code >= 400:
		logging.warning("request returned status code %s\n%s" % (response.status_code, response.content))
		raise DownloadError("request returned status code %s" % (response.status_code,))

	response = json.loads(response.content)
	logging.info("got JSON response with keys: %s" % (response.keys(),))

	try:
		content.body = response['content']
		content.title = response['title']
	except KeyError as e:
		raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
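Both extractors raise deferred.PermanentTaskFailure, which only has an effect when the function runs as an App Engine deferred task: it tells the task queue not to retry. A hedged sketch of how extract might be enqueued; schedule_extraction and _extract_by_key are illustrative names, not taken from pagefeed.

from google.appengine.ext import db, deferred

def schedule_extraction(page):
    # Defer by key rather than by entity so the task payload stays small and
    # reliably picklable.
    deferred.defer(_extract_by_key, page.key())

def _extract_by_key(key):
    # Re-fetch the Page and run the module-level extract() from the examples above.
    page = db.get(key)
    if page is not None:
        extract(page)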
Example #4
	def get(self):
		assert users.is_current_user_admin()
		logging.info("purging old content entries")
		Content.purge()
		logging.info("finished purging old content entries")
		self.response.out.write("ok")
Example #5
	def setUp(self):
		from pagefeed.models import Page, Content
		super(CleanDBTest, self).setUp()
		db.delete(Page.all())
		db.delete(Content.all())
Example #6
	def content_big_enough_by(self, n):
		body = 'a' * (Content.min_size + n - len(some_title))
		content = Content(url=some_url, title=some_title, body=body)
		self.assertEqual(content.size, Content.min_size + n)
		self.assertFalse(content.too_small())
		return content
Example #7
	def content_too_small_by(self, n):
		body = 'a' * (Content.min_size - n - len(some_title))
		content = Content(url=some_url, title=some_title, body=body)
		self.assertEqual(content.size, Content.min_size - n)
		self.assertTrue(content.too_small())
		return content
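Examples #6 and #7 imply that Content.size is the combined length of title and body, and that too_small() compares it against a min_size threshold. A hedged sketch consistent with those tests follows; the min_size value of 500 is illustrative only, and the real implementation may differ.

from google.appengine.ext import db

class Content(db.Model):
    min_size = 500  # illustrative threshold; the real value is not shown in these examples
    title = db.StringProperty()
    body = db.TextProperty()

    @property
    def size(self):
        # Combined length of title and body, matching the arithmetic in the tests.
        return len(self.title or '') + len(self.body or '')

    def too_small(self):
        return self.size < self.min_size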
Example #8
		def all_contents():
			return Content.all().fetch(5)