def extract(page):
    """Extract title and body for ``page`` with the built-in parser.

    Builds a Content entity tagged NATIVE and fills in its body and
    title from the page's raw HTML.  Any StandardError raised during
    parsing is converted into a PermanentTaskFailure so the deferred
    task is not retried.
    """
    target = page.content_url
    content = Content(url=target, source=NATIVE)
    logging.info("fetching %r with native extractor" % (target,))
    raw_html = page.raw_content
    try:
        soup = page_parser.parse(raw_html,
                base_href=page.base_href, notify=logging.info)
        content.body = page_parser.get_body(soup)
        content.title = page_parser.get_title(soup)
    except StandardError as e:
        # permanent failure: a parse error will not fix itself on retry
        raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
def test_content_should_be_purged_if_it_is_older_than_1_day(self):
    """purge() removes entries whose lastmod is past the one-day cutoff."""
    # one minute beyond the cutoff, so it must be purged
    stale = Content(url=some_url, body='old body', title='old title',
            lastmod=datetime.utcnow() - timedelta(days=1, minutes=1))
    fresh = Content(url=some_url, body='new body', title='new title')
    stale.put()
    fresh.put()
    Content.purge()
    remaining = list(Content.all())
    self.assertEqual(remaining, [fresh])
def extract(page):
    """Extract title and body for ``page`` via the viewtext.org JSON API.

    Builds a Content entity tagged VIEWTEXT.  Raises DownloadError on an
    HTTP error status, and PermanentTaskFailure when the JSON response is
    missing the expected keys (so the deferred task is not retried).
    """
    url = page.content_url
    content = Content(url=url, source=VIEWTEXT)
    endpoint = "http://viewtext.org/api/text?url=%(url)s&format=json&rl=false" % {'url': urllib.quote(url)}
    logging.debug("fetching: %s with viewtext extractor" % (endpoint,))
    response = fetch(endpoint, allow_truncated=False, deadline=20)
    if response.status_code >= 400:
        logging.warning("request returned status code %s\n%s" % (response.status_code, response.content))
        raise DownloadError("request returned status code %s" % (response.status_code,))
    # distinct name for the decoded payload; the original reused `response`
    data = json.loads(response.content)
    logging.info("got JSON response with keys: %s" % (data.keys(),))
    try:
        content.body = data['content']
        content.title = data['title']
    except KeyError as e:
        raise deferred.PermanentTaskFailure("%s: %s" % (type(e), e))
def get(self):
    """Admin-only handler that purges old Content entries.

    The original guarded access with ``assert``, which is stripped when
    Python runs with ``-O`` — leaving the endpoint open to anyone.  Use
    an explicit check and answer 403 instead.
    """
    if not users.is_current_user_admin():
        self.error(403)
        return
    logging.info("purging old content entries")
    Content.purge()
    logging.info("finished purging old content entries")
    self.response.out.write("ok")
def setUp(self):
    """Start each test from an empty datastore."""
    from pagefeed.models import Page, Content
    super(CleanDBTest, self).setUp()
    # wipe every entity of both kinds before the test body runs
    for model in (Page, Content):
        db.delete(model.all())
def content_big_enough_by(self, n):
    """Build and return a Content whose size exceeds the minimum by ``n``.

    Asserts that the computed size is exactly ``min_size + n`` and that
    the entity is not considered too small.  Uses ``assertEqual`` — the
    ``assertEquals`` alias is deprecated and the sibling tests already
    use the canonical name.
    """
    body = 'a' * (Content.min_size + n - len(some_title))
    content = Content(url=some_url, title=some_title, body=body)
    self.assertEqual(content.size, Content.min_size + n)
    self.assertFalse(content.too_small())
    return content
def content_too_small_by(self, n):
    """Build and return a Content whose size falls short of the minimum by ``n``.

    Asserts that the computed size is exactly ``min_size - n`` and that
    the entity is flagged too small.  Uses ``assertEqual`` — the
    ``assertEquals`` alias is deprecated and the sibling tests already
    use the canonical name.
    """
    body = 'a' * (Content.min_size - n - len(some_title))
    content = Content(url=some_url, title=some_title, body=body)
    self.assertEqual(content.size, Content.min_size - n)
    self.assertTrue(content.too_small())
    return content
def all_contents(limit=5):
    """Return up to ``limit`` Content entities (default 5, as before).

    The fetch cap was previously hard-coded; exposing it as a keyword
    argument with the same default keeps existing callers working.
    """
    return Content.all().fetch(limit)