def _collect_baidu_urls(driver, key_word, real_url_set):
    """Add the redirect targets of Baidu results for key_word to real_url_set."""
    containers = []  # result blocks gathered across all visited pages

    driver.get("https://www.baidu.com/s?ie=UTF-8&wd=" + key_word)

    # Gather the result containers of each result page.
    for _ in range(MAX_PAGEs):
        soup = BF(driver.page_source)
        containers.extend(
            soup.findAll("div", {"class": re.compile(".*c-container.*")}))
        # NOTE(review): find_element_by_xpath was removed in Selenium 4.3;
        # confirm the Selenium version this project pins.
        driver.find_element_by_xpath("//*[text()='下一页>']").click()
        time.sleep(2)

    # Result links are Baidu redirect links: request each one without
    # following the redirect and read the target from the Location header.
    for container in containers:
        heading = container.find('h3')
        anchor = heading.find('a') if heading is not None else None
        if anchor is None:
            # Container without a title link (e.g. a widget); nothing to resolve.
            continue
        try:
            response = requests.get(
                url=anchor.get('href'), headers=headers, allow_redirects=False)
        except requests.RequestException:
            # Best effort: an unreachable or malformed link is skipped.
            continue
        real_url = response.headers.get('Location')
        if real_url is None:
            # The response was not a redirect, so there is no target URL.
            continue
        if real_url.startswith('http'):
            # The trailing newline is kept as in the original implementation;
            # callers may rely on it (entries from other engines have none).
            real_url_set.add(real_url + '\n')


def _collect_bing_urls(driver, key_word, real_url_set):
    """Add the result links of Bing (cn.bing.com) for key_word to real_url_set."""
    bing_url = "https://cn.bing.com/search?q=" + key_word + "&FORM=PORE"
    try:
        driver.get(bing_url)
    except Exception:
        # Best effort: if the initial load fails, retry through a refresh.
        driver.refresh()

    # The page needs one refresh before the results can be read.
    time.sleep(2)
    driver.refresh()
    time.sleep(5)

    for _ in range(MAX_PAGEs):
        soup = BF(driver.page_source)
        results = soup.find("ol", {"id": "b_results"})
        headings = results.findAll("h2") if results is not None else []
        for heading in headings:
            anchor = heading.find("a")
            if anchor is None:
                # Same as the original: stop at the first heading that
                # carries no link and move on to the next page.
                break
            href = anchor.get('href')
            if href:
                real_url_set.add(href)
        driver.find_element_by_xpath(".//*[@title='下一页']").click()
        time.sleep(2)


def _collect_eia_urls(key_word, real_url_set):
    """Add the result links of the EIA site search for key_word to real_url_set."""
    for page_index in range(MAX_PAGEs):
        eia_url = (
            "https://search.usa.gov/search?affiliate=eia.doe.gov"
            "&page={}&query={}&utf8=%E2%9C%93"
        ).format(page_index, key_word)
        try:
            response = requests.get(
                url=eia_url, headers=headers, allow_redirects=False)
        except requests.RequestException:
            # Best effort, consistent with the Baidu step: a failing page
            # must not discard the URLs that were already collected.
            continue
        soup = BF(response.text)
        for item in soup.findAll("div", {"class": "content-block-item result"}):
            anchor = item.find('a')
            href = anchor.get('href') if anchor is not None else None
            if href:
                real_url_set.add(href)


def get_url_set(driver, key_word):
    """Collect result URLs for a keyword from Baidu, Bing and the EIA search.

    Up to MAX_PAGEs result pages are visited per search engine. Baidu and
    Bing are browsed through ``driver``; the EIA search is queried with
    ``requests``. Google is not implemented.

    Args:
        driver: Selenium WebDriver used to load and paginate the Baidu and
            Bing result pages.
        key_word: Search term; it is inserted into the query URLs as given.

    Returns:
        A set of URL strings. URLs obtained from Baidu carry a trailing
        newline character, URLs from Bing and EIA do not.

    Raises:
        Whatever ``driver`` raises when a page cannot be loaded or the
        "next page" control cannot be found.
    """
    real_url_set = set()

    _collect_baidu_urls(driver, key_word, real_url_set)
    _collect_bing_urls(driver, key_word, real_url_set)
    _collect_eia_urls(key_word, real_url_set)

    return real_url_set