def fetch(limit=100, retreive_all=False):
    """Crawl the feeds with the client crawler.

    Args:
        limit: Forwarded unchanged to ``CrawlerScheduler.run`` as ``limit``.
        retreive_all: Forwarded unchanged to ``CrawlerScheduler.run`` as
            ``retreive_all`` (the spelling is part of the existing interface).
    """
    # Function-scope import: the crawler module is only imported when
    # fetch() is actually called.
    from crawler.http_crawler import CrawlerScheduler

    login, password = conf.CRAWLER_LOGIN, conf.CRAWLER_PASSWD
    crawler = CrawlerScheduler(login, password)
    run_options = {'limit': limit, 'retreive_all': retreive_all}

    with crawler.pool:
        crawler.run(**run_options)
        # NOTE(review): the original formatting was lost; wait() is assumed
        # to run while the pool context is still open - confirm.
        crawler.wait()
def test_no_add_on_304(self):
    """Check that a crawl run with status code 304 adds no article.

    The article count returned by the API must be 18 both before and
    after the scheduler has run.
    """
    def list_articles():
        # Same API call as used throughout these tests; returns decoded JSON.
        resp = self._api('get', 'articles',
                         data={'limit': 1000}, user='******')
        return resp.json()

    # Presumably the status code served by the test's fake HTTP layer -
    # TODO confirm against the fixture that reads resp_status_code.
    self.resp_status_code = 304
    self.assertEqual(18, len(list_articles()))

    scheduler = CrawlerScheduler('admin', 'admin')
    scheduler.run()
    scheduler.wait()

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(18, len(list_articles()))
def test_no_add_on_304(self):
    """Check that a crawl run with status code 304 adds no article.

    The scheduler is created before anything else; the article count
    returned by the API must be 18 both before and after it has run.
    """
    def list_articles():
        # Same API call as used throughout these tests; returns decoded JSON.
        resp = self._api('get', 'articles',
                         data={'limit': 1000}, user='******')
        return resp.json()

    # Kept as the very first statement, as in the original ordering.
    scheduler = CrawlerScheduler('admin', 'admin')

    # Presumably the status code served by the test's fake HTTP layer -
    # TODO confirm against the fixture that reads resp_status_code.
    self.resp_status_code = 304
    self.assertEqual(18, len(list_articles()))

    scheduler.run()
    scheduler.wait()

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(18, len(list_articles()))
def test_http_crawler_add_articles(self):
    """Check that a crawl adds articles and that a 304 re-crawl adds none.

    Expected article counts: 18 before the first crawl, 143 after it, and
    still 143 after a second crawl run with ``resp_status_code`` set to 304.
    The content of every article listed after the first crawl must contain
    neither ``srcset=`` nor ``src="/``.
    """
    def list_articles():
        # Same API call as used throughout these tests; returns decoded JSON.
        resp = self._api('get', 'articles',
                         data={'limit': 1000}, user='******')
        return resp.json()

    scheduler = CrawlerScheduler('admin', 'admin')
    self.assertEqual(18, len(list_articles()))

    # First crawl: new articles are expected.
    scheduler.run()
    scheduler.wait()
    articles = list_articles()
    self.assertEqual(143, len(articles))
    for art in articles:
        # assertNotIn reports the offending content on failure, unlike
        # assertFalse(x in y).
        self.assertNotIn('srcset=', art['content'])
        self.assertNotIn('src="/', art['content'])

    # Second crawl with status code 304: the count must not change.
    # Presumably resp_status_code drives the test's fake HTTP layer -
    # TODO confirm against the fixture.
    self.resp_status_code = 304
    scheduler.run()
    scheduler.wait()
    self.assertEqual(143, len(list_articles()))
def test_matching_etag(self):
    """Check article counts across three etag scenarios.

    1. Feeds reset with etag ``'fake etag'`` and response etag header
       ``'fake etag'``: the count stays at 18 after a crawl.
    2. Feeds reset with etag ``'jarr/fake etag'`` and the same response
       etag header: the count stays at 18.
    3. Feeds reset with etag ``'jarr/fake etag'`` but a different response
       etag header: the count rises to 143.
    """
    def count_articles():
        # Same API call as used throughout these tests.
        resp = self._api('get', 'articles',
                         data={'limit': 1000}, user='******')
        return len(resp.json())

    def crawl():
        scheduler.run()
        scheduler.wait()

    # Scenario 1.
    # NOTE(review): _reset_feeds_freshness presumably stores the given etag
    # on the feeds, and resp_headers presumably feeds the fake HTTP
    # responses - confirm against the helper and fixture.
    self._reset_feeds_freshness(etag='fake etag')
    self.resp_headers = {'etag': 'fake etag'}
    self.assertEqual(18, count_articles())
    scheduler = CrawlerScheduler('admin', 'admin')
    crawl()
    self.assertEqual(18, count_articles())

    # Scenario 2: identical 'jarr/'-prefixed etag on both sides.
    self._reset_feeds_freshness(etag='jarr/fake etag')
    self.resp_headers = {'etag': 'jarr/fake etag'}
    crawl()
    self.assertEqual(18, count_articles())

    # Scenario 3: response etag differs from the one set on the feeds.
    self._reset_feeds_freshness(etag='jarr/fake etag')
    self.resp_headers = {'etag': '########################'}
    crawl()
    self.assertEqual(143, count_articles())