Exemple #1
0
 def test_gen_urls_for_ptt(self):
     """Check gen_urls with a single numeric product list for the PTT index.

     The product list holds only the number 6188, and the test expects
     exactly one URL with that number substituted for ``%d``.
     """
     template = 'https://www.ptt.cc/bbs/movie/index%d.html'
     page_numbers = list(range(6188, 6189))
     generated = list(pycheetah.gen_urls(template, product=[page_numbers]))
     self.assertEqual(1, len(generated))
     self.assertEqual('https://www.ptt.cc/bbs/movie/index6188.html',
                      generated[0])
Exemple #2
0
 def test_get_urls_exception(self):
     """Expect ValueError when gen_urls is consumed with a bad start date.

     The start date '2017/1/0' has day 0; presumably that is what makes
     gen_urls raise (confirm against the pycheetah implementation).
     """
     sections = ['world']
     template = 'https://www.theguardian.com/%s/%s/all'
     with self.assertRaises(ValueError):
         # The result is consumed in full so that an error raised lazily
         # during iteration is still caught by the context manager.
         list(pycheetah.gen_urls(template,
                                 '2017/1/0',
                                 '2017/1/5',
                                 product=[sections, 'date']))
Exemple #3
0
 def test_gen_urls_for_nyt(self):
     """Check gen_urls over a date range for the NYT "today's paper" index.

     With start '2017/1/1', end '2017/1/5' and an explicit
     ``date_format``, the test expects five URLs.
     """
     template = 'http://www.nytimes.com/indexes/%s/todayspaper/index.html'
     generated = list(pycheetah.gen_urls(template,
                                         '2017/1/1',
                                         '2017/1/5',
                                         date_format='%Y/%m/%d',
                                         product=['date']))
     self.assertEqual(5, len(generated))
Exemple #4
0
def main():
    """Run the two-stage PTT movie-board crawl as a generator.

    Yields two values in order:
      1. the ``'links'`` reduction of ``Board.start`` over the index URLs;
      2. the ``'article'`` reduction of ``Article.start`` over those links.

    Nothing runs until the generator is first advanced, and the second
    stage only starts once the caller resumes after the first yield.
    """
    pycheetah.init_logger()

    # Index pages numbered 6180 through 6188 (the range end is exclusive).
    page_numbers = list(range(6180, 6189))
    index_urls = list(
        pycheetah.gen_urls('https://www.ptt.cc/bbs/movie/index%d.html',
                           product=[page_numbers]))

    board_result = Board.start(index_urls)
    article_urls = board_result.reduce_by('links')
    yield article_urls

    article_result = Article.start(article_urls)
    yield article_result.reduce_by('article')
Exemple #5
0
    def test_gen_urls_for_guardian(self):
        """Check gen_urls when combining a section list with a date range.

        Eleven section names are combined with the range '2017/1/1' to
        '2017/1/5'; the test expects 55 URLs in total.
        """
        sections = [
            'world', 'politics', 'sport', 'football', 'culture', 'business',
            'lifeandstyle', 'fashion', 'environment', 'technology', 'travel'
        ]
        template = 'https://www.theguardian.com/%s/%s/all'

        generated = list(pycheetah.gen_urls(template,
                                            '2017/1/1',
                                            '2017/1/5',
                                            product=[sections, 'date']))
        self.assertEqual(55, len(generated))
Exemple #6
0
def main():
    """Run the two-stage NYT "today's paper" crawl as a generator.

    Yields two values in order:
      1. the ``'urls'`` reduction of ``DailyPage.start`` over the index URLs;
      2. the ``'title'`` reduction of ``NewsPage.start`` over those URLs.

    Start and end dates are both '2017/1/1'. The second stage only starts
    once the caller resumes after the first yield.
    """
    pycheetah.init_logger()

    template = 'http://www.nytimes.com/indexes/%s/todayspaper/index.html'
    daily_urls = list(
        pycheetah.gen_urls(template,
                           '2017/1/1',
                           '2017/1/1',
                           date_format='%Y/%m/%d',
                           product=['date']))

    daily_result = DailyPage.start(daily_urls)
    news_urls = daily_result.reduce_by('urls')
    yield news_urls

    news_result = NewsPage.start(news_urls)
    yield news_result.reduce_by('title')
Exemple #7
0
def main():
    """Run the two-stage Guardian crawl as a generator.

    Yields two values in order:
      1. the ``'urls'`` reduction of ``DailyPage.start`` over the
         per-section daily URLs;
      2. the ``'name'`` reduction of ``NewsPage.start`` over those URLs.

    Start and end dates are both '2017/1/1'. The second stage only starts
    once the caller resumes after the first yield.
    """
    sections = [
        'world', 'politics', 'sport', 'football', 'culture', 'business',
        'lifeandstyle', 'fashion', 'environment', 'technology', 'travel'
    ]
    template = 'https://www.theguardian.com/%s/%s/all'
    all_daily_urls = list(
        pycheetah.gen_urls(template,
                           '2017/1/1',
                           '2017/1/1',
                           product=[sections, 'date']))

    # The logger is initialised only after the URL list has been built,
    # matching the original statement order.
    pycheetah.init_logger()

    daily_result = DailyPage.start(all_daily_urls)
    news_urls = daily_result.reduce_by('urls')
    yield news_urls

    news_result = NewsPage.start(news_urls)
    yield news_result.reduce_by('name')