Example #1
0
def _click_next_page(driver, xpath):
    """Click the element matched by ``xpath``; return False if that fails."""
    try:
        driver.find_element_by_xpath(xpath).click()
    except Exception:
        # The WebDriver exception classes are not in scope in this block, so
        # any failure here (typically: no "next page" link on the last page)
        # simply ends pagination instead of discarding collected results.
        return False
    return True


def _baidu_urls(driver, key_word):
    """Return the redirect targets of the Baidu results for ``key_word``."""
    containers = []  # result <div> tags gathered across all pages
    driver.get("https://www.baidu.com/s?ie=UTF-8&wd=" + key_word)
    for _ in range(MAX_PAGEs):
        soup = BF(driver.page_source)
        containers.extend(
            soup.findAll("div", {"class": re.compile(".*c-container.*")}))
        if not _click_next_page(driver, "//*[text()='下一页>']"):
            break
        time.sleep(2)

    urls = set()
    # Each Baidu result link is a redirect; request it without following the
    # redirect and read the original address from the Location header.
    for container in containers:
        heading = container.find('h3')
        link = heading.find('a') if heading is not None else None
        href = link.get('href') if link is not None else None
        if not href:
            continue
        try:
            response = requests.get(url=href,
                                    headers=headers,
                                    allow_redirects=False)
        except requests.RequestException:
            continue
        real_url = response.headers.get('Location')
        if real_url and real_url.startswith('http'):
            # NOTE(review): only Baidu entries carry a trailing newline; kept
            # as-is because callers may rely on it -- confirm and unify.
            urls.add(real_url + '\n')
    return urls


def _bing_urls(driver, key_word):
    """Return the result links of the Bing results for ``key_word``."""
    urls = set()
    try:
        driver.get("https://cn.bing.com/search?q=" + key_word + "&FORM=PORE")
    except Exception:
        driver.refresh()
    # The page needs to be refreshed once before the results are read.
    time.sleep(2)
    driver.refresh()
    time.sleep(5)
    for _ in range(MAX_PAGEs):
        soup = BF(driver.page_source)
        results = soup.find("ol", {"id": "b_results"})
        if results is None:
            # No result list on this page; nothing more to collect.
            break
        for heading in results.findAll("h2"):
            link = heading.find("a")
            if link is None:
                # Same as the original: the first heading without a link
                # ends processing of this page.
                break
            href = link.get('href')
            if href:
                urls.add(href)
        if not _click_next_page(driver, ".//*[@title='下一页']"):
            break
        time.sleep(2)
    return urls


def _eia_urls(key_word):
    """Return the result links of the EIA site search for ``key_word``."""
    urls = set()
    # NOTE(review): page numbers start at 0 here, as in the original; confirm
    # whether search.usa.gov expects 1-based pages.
    for page in range(MAX_PAGEs):
        eia_url = ("https://search.usa.gov/search?affiliate=eia.doe.gov"
                   "&page={}&query={}&utf8=%E2%9C%93").format(page, key_word)
        try:
            response = requests.get(url=eia_url,
                                    headers=headers,
                                    allow_redirects=False)
        except requests.RequestException:
            continue
        soup = BF(response.text)
        for item in soup.findAll("div",
                                 {"class": "content-block-item result"}):
            link = item.find('a')
            href = link.get('href') if link is not None else None
            if href:
                urls.add(href)
    return urls


def get_url_set(driver, key_word):
    """Collect search-result URLs for ``key_word`` from several sources.

    Baidu and Bing are browsed through ``driver`` for up to ``MAX_PAGEs``
    result pages each; the EIA site search (search.usa.gov) is fetched
    directly with ``requests``. Google is not implemented yet.

    Args:
        driver: browser driver object providing ``get``, ``refresh``,
            ``page_source`` and ``find_element_by_xpath``.
        key_word: search term; it is concatenated into the query URLs
            unchanged (no URL-encoding is applied here).

    Returns:
        A set of URL strings. Entries obtained from Baidu end with a newline
        character; entries from the other sources do not.
    """
    real_url_set = set()
    real_url_set.update(_baidu_urls(driver, key_word))
    real_url_set.update(_bing_urls(driver, key_word))
    real_url_set.update(_eia_urls(key_word))
    return real_url_set