def test_next_page(self):
    """Check the value ``crawler.next_page`` returns for two HTML fixtures.

    The first fixture is expected to yield the page-2 search URL; the second
    fixture is expected to yield ``None``.
    """
    expected_url = "https://github.com/search?p=2&q=python+crawler+json&type=Repositories"

    # Fixture 1: the parsed <ul> should produce the URL of the second page.
    first_soup = BeautifulSoup(data_next_page_1(), "lxml")
    first_result = crawler.next_page(first_soup.body.ul)
    self.assertEqual(first_result, expected_url)

    # Fixture 2: the parsed <ul> should produce no URL at all.
    second_soup = BeautifulSoup(data_next_page_2(), "lxml")
    second_result = crawler.next_page(second_soup.body.ul)
    self.assertEqual(second_result, None)
def MultiPageDownload():
    """Download several pages in one run.

    Asks the user how many pages to download, asks for the tag through
    ``crawler.ask_tag``, downloads the first page, and then downloads each
    following page until the requested count has been reached.

    Returns:
        True after all requested pages have been handed to ``coreDL``.
    """
    page_count = int(input("请输入一共要下载页数 : "))

    tag_url = crawler.ask_tag()  # ask the user for the tag
    first_html, first_url = crawler.determineTag(tag_url)  # check that the tag exists
    coreDL(first_html)

    # Page 1 is already downloaded above; the loop covers the remaining pages.
    # Each iteration passes the first page's URL together with the number of
    # the page downloaded last, exactly as the counter-based loop did.
    for page_number in range(1, page_count):
        following_url, _ = crawler.next_page(first_url, page_number)
        coreDL(crawler.getSource(following_url))

    return True
def SinglePageDownload():
    """Download one page at a time, asking before each further page.

    Asks for the tag through ``crawler.ask_tag``, downloads the first page,
    then repeatedly asks whether the next page should be downloaded. Any
    answer that does not start with ``Y`` or ``y`` -- including an empty
    answer -- ends the loop.

    Returns:
        True once the user declines to continue.
    """
    full_url = crawler.ask_tag()  # ask the user for the tag
    current_page_html, current_page_url = crawler.determineTag(full_url)  # check that the tag exists
    coreDL(current_page_html)  # download the first page

    current_page_number = 1  # the first page has just been downloaded

    # A flag drives the loop (rather than ``while True``) so that the exit
    # condition stays explicit.
    keep_going = True
    while keep_going:
        answer = input('是否下载下一页内容:[Y/N]')
        # Slice instead of indexing: ``answer[0]`` raises IndexError when the
        # user just presses Enter, whereas ``answer[:1]`` is then simply "".
        keep_going = answer[:1] in ("Y", "y")
        if keep_going:
            print(current_page_url)
            next_page_url, next_page_number = crawler.next_page(
                current_page_url, current_page_number
            )
            print(next_page_url, next_page_number)
            current_page_number += 1
            coreDL(crawler.getSource(next_page_url))

    return True