Python BeautifulSoup.BF Examples

Programming Language: Python

Namespace/Package Name: bs4

Class/Type: BeautifulSoup

Method/Function: BF

Examples at hotexamples.com: 1

Python BeautifulSoup.BF - 1 example found. This is the top-rated real-world Python example of bs4.BeautifulSoup.BF, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

append(30)

BeautifulSoup(30)

__str__(30)

__init__(11)

attrs(10)

__len__(8)

__repr__(3)

__unicode__(2)

article(2)

__copy__(2)

__getattr__(2)

first(2)

findAllNext(2)

feed(1)

currentTag(1)

fartind(1)

BF(1)

filter_wikilinks(1)

fina_all(1)

fnd_all(1)

h1(1)

replace_with(1)

td(1)

toCSV(1)

copy(1)

alcohol(1)

astype(1)

assign(1)

apply(1)

add_structure(1)

add_shared_term(1)

a(1)

_title(1)

_repr_html_(1)

_find_all(1)

_all_strings(1)

__getitem__(1)

__contains__(1)

NavigableString(1)

Date(1)

wrap(1)

Example #1

Show file

def _click_next_page(driver, xpath):
    """Click the element matched by ``xpath``; return False if that fails."""
    # Resolve the method outside the ``try`` so that a driver lacking this
    # API raises loudly instead of being mistaken for "no next page".
    # NOTE(review): find_element_by_xpath is gone in recent Selenium
    # releases; switch to find_element(By.XPATH, ...) if the driver is upgraded.
    find_by_xpath = driver.find_element_by_xpath
    try:
        find_by_xpath(xpath).click()
    except Exception:
        # No such element / not clickable: treat as "last page reached".
        return False
    return True


def get_url_set(driver, key_word):
    """Collect search-result URLs for ``key_word`` from Baidu, Bing and EIA.

    Args:
        driver: Browser driver exposing ``get``, ``refresh``, ``page_source``
            and ``find_element_by_xpath`` (presumably a Selenium WebDriver).
        key_word: Search term; it is concatenated into each search URL as-is.

    Returns:
        A set of URL strings. URLs resolved through Baidu's redirect links
        carry a trailing ``'\\n'``; Bing and EIA URLs do not.

    Relies on the module-level names ``MAX_PAGEs`` (pages to scan per
    engine), ``headers`` (HTTP request headers) and ``BF`` (HTML parser
    constructor).
    """
    container_list = []  # Baidu result containers gathered across pages
    real_url_set = set()  # de-duplicated result URLs

    # --- Baidu search results ---
    baidu_url = "https://www.baidu.com/s?ie=UTF-8&wd=" + key_word
    driver.get(baidu_url)
    container_pattern = re.compile(".*c-container.*")  # compiled once, not per page
    # Gather the result containers of every page.
    for _ in range(MAX_PAGEs):
        soup = BF(driver.page_source)
        container_list.extend(
            soup.findAll("div", {"class": container_pattern}))
        # Stop paginating when there is no next page instead of crashing and
        # losing everything collected so far.
        if not _click_next_page(driver, "//*[text()='下一页>']"):
            break
        time.sleep(2)

    # Baidu links are redirect URLs: request each one without following the
    # redirect and read the real address from the Location header.
    for container in container_list:
        heading = container.find('h3')
        link = heading.find('a') if heading is not None else None
        href = link.get('href') if link is not None else None
        if not href:
            continue  # container without a usable link
        try:
            response = requests.get(url=href,
                                    headers=headers,
                                    allow_redirects=False)
        except Exception:
            # Best-effort: skip any entry whose redirect cannot be fetched.
            continue
        real_url = response.headers.get('Location')  # original page address
        if real_url and real_url.startswith('http'):
            # NOTE(review): only Baidu URLs get a trailing newline; kept
            # unchanged because callers may depend on it.
            real_url_set.add(real_url + '\n')

    # --- Bing search results ---
    being_url = "https://cn.bing.com/search?q=" + key_word + "&FORM=PORE"
    try:
        driver.get(being_url)
    except Exception:
        driver.refresh()
    # The page needs one refresh before the results can be read.
    time.sleep(2)
    driver.refresh()
    time.sleep(5)
    for _ in range(MAX_PAGEs):
        soup = BF(driver.page_source)
        results = soup.find("ol", {"id": "b_results"})
        if results is None:
            break  # no result list on this page; nothing more to paginate
        for heading in results.findAll("h2"):
            link = heading.find("a")
            if link is None:
                # Stop scanning this page at the first heading without a link.
                break
            href = link.get('href')
            if href:
                real_url_set.add(href)
        if not _click_next_page(driver, ".//*[@title='下一页']"):
            break
        time.sleep(2)

    # Google is not implemented yet.

    # --- EIA site search ---
    for i in range(MAX_PAGEs):
        # Fixed: the template previously contained a stray '}' which made
        # str.format raise ValueError.
        eia_url = "https://search.usa.gov/search?affiliate=eia.doe.gov&page={}&query={}&utf8=%E2%9C%93".format(
            i, key_word)
        try:
            response = requests.get(url=eia_url,
                                    headers=headers,
                                    allow_redirects=False)
        except Exception:
            # Same best-effort policy as the Baidu section: skip failed pages.
            continue
        # Fixed: parse the response into a new soup (was `BF1.BF(...)`, which
        # never parsed the EIA page).
        soup = BF(response.text)
        for item in soup.findAll(
                "div", {"class": "content-block-item result"}):
            link = item.find('a')
            if link is None:
                continue
            href = link.get('href')
            if href:
                real_url_set.add(href)
    return real_url_set