def crawl(url):
    """Crawl a section and collect the URLs of all pages containing a video.

    Parameters:
        url: URL of any index page of the section; ``parser.index`` expands
            it into the full list of index pages.

    Returns:
        list: video-page URLs gathered from every index page. Order depends
        on thread completion and is therefore arbitrary.
    """
    # Get an index of all the pages in the section.
    pages = parser.index(url)
    result = []
    # Parse each page of the section concurrently to find video page urls.
    with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
        # A plain list of futures is enough here: the original mapped each
        # future back to its page, but those keys were never read.
        futures = [executor.submit(parser.video_pages, page) for page in pages]
        for future in concurrent.futures.as_completed(futures):
            result.extend(future.result())
    # list of all the pages containing a video
    return result
def test_index_not_last_page(self, foo):
    """Indexing from a middle page still yields every page of the section."""
    base = 'http://www.jeuxvideo.com/toutes-les-videos/type-7340/?p='
    expected = []
    for page_number in range(1, 304):
        expected.append(base + str(page_number))
    actual = index(base + '290')
    self.assertEqual(actual, expected)
def test_index_404(self, foo):
    """An unreachable URL produces an empty index."""
    self.assertEqual(index('url'), [])