def test_gen_urls_for_ptt(self):
    """A %d template with a single-number range product yields exactly one URL.

    range(6188, 6189) contains only 6188, so gen_urls must emit the one
    PTT board-index URL with that number substituted.
    """
    # list(...) instead of the original [i for i in ...] wrapper (C416);
    # 'urls' instead of the ambiguous single-letter name 'l' (E741).
    urls = list(
        pycheetah.gen_urls('https://www.ptt.cc/bbs/movie/index%d.html',
                           product=[list(range(6188, 6189))]))
    self.assertEqual(1, len(urls))
    self.assertEqual('https://www.ptt.cc/bbs/movie/index6188.html', urls[0])
def test_get_urls_exception(self):
    """An invalid start date ('2017/1/0' — day 0 does not exist) raises ValueError.

    gen_urls is a generator, so it must be consumed inside the
    assertRaises block for the error to surface.
    """
    # snake_case for a plain local list (was PascalCase 'Classification'),
    # and list(...) to drain the generator instead of a throwaway
    # comprehension whose result was never used.
    classification = ['world']
    with self.assertRaises(ValueError):
        list(
            pycheetah.gen_urls('https://www.theguardian.com/%s/%s/all',
                               '2017/1/0', '2017/1/5',
                               product=[classification, 'date']))
def test_gen_urls_for_nyt(self):
    """A 5-day inclusive date range with date_format expands to 5 URLs."""
    # list(...) instead of the original [i for i in ...] wrapper (C416);
    # 'urls' instead of the ambiguous single-letter name 'l' (E741).
    urls = list(
        pycheetah.gen_urls(
            'http://www.nytimes.com/indexes/%s/todayspaper/index.html',
            '2017/1/1', '2017/1/5',
            date_format='%Y/%m/%d',
            product=['date']))
    self.assertEqual(5, len(urls))
def main():
    """Two-stage PTT crawl as a generator pipeline.

    Stage 1 scrapes board-index pages (6180-6188) for article links;
    stage 2 fetches those articles. Each stage's reduced output is
    yielded to the caller.
    """
    pycheetah.init_logger()
    urls = list(
        pycheetah.gen_urls('https://www.ptt.cc/bbs/movie/index%d.html',
                           product=[list(range(6180, 6189))]))
    result = Board.start(urls)
    urls = result.reduce_by('links')
    yield urls
    # fixed typo: was 'reseult'
    result = Article.start(urls)
    yield result.reduce_by('article')
def test_gen_urls_for_guardian(self):
    """11 categories crossed with a 5-day inclusive range -> 55 URLs."""
    # snake_case for a plain local list (was PascalCase 'Classification');
    # list(...) instead of the [i for i in ...] wrapper (C416);
    # 'urls' instead of the ambiguous single-letter name 'l' (E741).
    categories = [
        'world', 'politics', 'sport', 'football', 'culture', 'business',
        'lifeandstyle', 'fashion', 'environment', 'technology', 'travel'
    ]
    urls = list(
        pycheetah.gen_urls('https://www.theguardian.com/%s/%s/all',
                           '2017/1/1', '2017/1/5',
                           product=[categories, 'date']))
    self.assertEqual(55, len(urls))
def main():
    """Two-stage NYT crawl as a generator pipeline.

    Stage 1 fetches the today's-paper index for a single day (2017-01-01)
    and collects article URLs; stage 2 fetches those pages and reduces
    their titles. Each stage's reduced output is yielded to the caller.
    """
    pycheetah.init_logger()
    daily_urls = list(
        pycheetah.gen_urls(
            'http://www.nytimes.com/indexes/%s/todayspaper/index.html',
            '2017/1/1',
            '2017/1/1',
            date_format='%Y/%m/%d',
            product=['date']))
    daily_result = DailyPage.start(daily_urls)
    article_urls = daily_result.reduce_by('urls')
    yield article_urls
    news_result = NewsPage.start(article_urls)
    yield news_result.reduce_by('title')
def main():
    """Two-stage Guardian crawl as a generator pipeline.

    Stage 1 fetches one day (2017-01-01) of daily pages across eleven
    site sections and collects article URLs; stage 2 fetches those pages
    and reduces their names. Each stage's reduced output is yielded.
    """
    sections = [
        'world', 'politics', 'sport', 'football', 'culture', 'business',
        'lifeandstyle', 'fashion', 'environment', 'technology', 'travel'
    ]
    all_daily_urls = list(
        pycheetah.gen_urls('https://www.theguardian.com/%s/%s/all',
                           '2017/1/1',
                           '2017/1/1',
                           product=[sections, 'date']))
    pycheetah.init_logger()
    daily_result = DailyPage.start(all_daily_urls)
    article_urls = daily_result.reduce_by('urls')
    yield article_urls
    news_result = NewsPage.start(article_urls)
    yield news_result.reduce_by('name')