Example #1
def bsp4_read(filename):
    """Read a local HTML file and return the parsed BeautifulSoup object."""
    with open(filename, 'r', encoding='UTF-8') as fp:
        html_doc = fp.read()

    soup = BSP4(html_doc, 'lxml')
    return soup
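All of these snippets rely on BSP4 as an alias for bs4.BeautifulSoup; the import below is an assumption (it is not shown in the originals), followed by a hypothetical call to bsp4_read with an illustrative file name.

# Assumed alias used throughout these examples (not part of the original snippets).
from bs4 import BeautifulSoup as BSP4

# Hypothetical usage: parse a saved page and print its <title>, if any.
soup = bsp4_read('saved_page.html')
print(soup.title.text if soup.title else 'no <title> found')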
Example #2
def parse(response):
    """

    :param response: 通过requests.get(url)方法得到的response对象
    :return:
    """
    # Get the domain
    domain = response.url[:-1]

    # Get the HTML text
    html_doc = response.content

    # Parse the text with Beautiful Soup to create the soup object
    soup = BSP4(html_doc, "lxml")

    # dir(soup) shows the many methods available on the soup object
    # print(dir(soup))

    # Each column section of the page sits inside a tbox, and all the tbox elements share one parent.
    # Select every tag with class "tbox" under the tag whose id is "p_left";
    # "p_left" is an id value, so it is prefixed with "#" in the CSS selector.
    tbox_list = soup.select("#p_left .tbox")

    print(type(tbox_list))  # <class 'list'>
    # Iterate over the tbox list and process each box
    for tbox in tbox_list:
        parse_tbox(tbox)
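For context, soup.select() takes a CSS selector string; the sketch below, with purely illustrative HTML, shows the "#id .class" pattern used above.

from bs4 import BeautifulSoup as BSP4  # assumed alias, as in Example #1

demo_html = '<div id="p_left"><div class="tbox">A</div><div class="tbox">B</div></div>'
demo_soup = BSP4(demo_html, 'lxml')
# "#p_left .tbox" matches every element with class "tbox" inside the element whose id is "p_left"
print([tag.text for tag in demo_soup.select('#p_left .tbox')])  # ['A', 'B']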
Example #3
def parse_url():
    """
    真正从title_url网站中下载要下载得文本内容
    :param url:
    :return:
    """
    num = input(f"Enter the number of the quote page to scrape (1-{title_index}), or 0 to scrape them all: ")
    motto_text = ''
    try:
        num = int(num)
        if num == 0:
            for url in g_url_set:
                response = requests.get(url)
                html_doc = response.content
                soup = BSP4(html_doc, 'lxml')
                motto_list = soup.select('.content p')

                for motto in motto_list:
                    motto_text += (motto.text + '\n')
                save_text(motto_text, num)
        elif num in range(1, title_index + 1):  # accept the full 1..title_index range offered by the prompt
            url_list = list(g_url_set)
            url = url_list[num - 1]
            response = requests.get(url)
            html_doc = response.content
            soup = BSP4(html_doc, 'lxml')
            motto_list = soup.select('.content p')
            for motto in motto_list:
                motto_text += (motto.text + '\n')
            save_text(motto_text, num)

            inq = input("Continue scraping? Enter 1 to continue, any other key to exit: ")
            if inq == '1':
                parse_url()
            else:
                pass
        else:
            print('------------------------------')
            print(f'|  Please enter a valid number! (0-{title_index})  |')
            print('------------------------------')
            parse_url()
    except ValueError:
        print('------------------------------')
        print(f'|  Please enter a valid number! (0-{title_index})  |')
        print('------------------------------')
        parse_url()
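The retry on bad input above works by calling parse_url() again from inside itself; a generic iterative pattern (a sketch, not taken from the original code) achieves the same re-prompting without growing the call stack.

# Generic sketch: loop until the user supplies a valid integer instead of recursing on bad input.
while True:
    raw = input("Enter a number: ")
    try:
        num = int(raw)
        break
    except ValueError:
        print("Please enter a valid number!")
print(num)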
Example #4
def parse(response):
    """
    对下载得页面进行处理
    :param response:
    :return:
    """
    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    tbox_list = soup.select('.listbox dl')
    for tbox in tbox_list:
        parse_tbox(tbox)
Example #5
def parse_page(type, page, ctype, url):
    """Download each not-yet-seen article linked from one listing page."""
    response = download(url, type, store_flag=False)
    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    link_list = soup.select('#p_left h2 a')
    index = 1
    for link in link_list:
        url_link = 'https://www.geyanw.com' + link['href']
        print(url_link)
        if url_link not in g_set:
            index += 1
            response = download(url_link,
                                type,
                                filename='%s_%s.html' % (ctype, index),
                                store_flag=True)
Example #6
def fetch_kongjie(url):
    """Scrape one user-list page, download each profile image, then follow the next-page link."""
    response = get_web_site(url)
    soup = BSP4(response.text, 'lxml')
    dl_list = soup.select(".oe_user_list dl")
    print(dl_list)
    for node in dl_list:
        detail_url = f"{DOMAIN}{node.dt.a.get('href')}"
        name = node.dd.h3.a.text
        detail_img = f"{DOMAIN}{node.dt.a.img.get('src')}"
        # print(f"name:{name}, detail_url:{detail_url}, detail_img:{detail_img}")
        print(f"{name}:下载完成!!")
        download_imgs(detail_img)
    # Get the URL of the next page: the link whose title attribute is "下一页" ("next page")
    next_nodes = soup.find_all(attrs={"title": "下一页"})
    if not next_nodes:
        return  # no next-page link: last page reached, stop recursing
    next_node = next_nodes[0]
    # print(f"type:{type(next_node)}, next_node:{next_node}, ")
    next_url = DOMAIN + next_node.get('href')
    fetch_kongjie(next_url)
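Since fetch_kongjie calls itself once per page, very deep pagination could hit Python's recursion limit; the sketch below is an iterative variant that reuses get_web_site, DOMAIN and download_imgs from the example above and stops when no next-page link remains.

def fetch_kongjie_iterative(url):
    # Same scraping as above, but walk the pages in a loop instead of recursing.
    while url:
        response = get_web_site(url)
        soup = BSP4(response.text, 'lxml')
        for node in soup.select(".oe_user_list dl"):
            download_imgs(f"{DOMAIN}{node.dt.a.img.get('src')}")
        # Stop when the page has no "下一页" (next page) link.
        next_nodes = soup.find_all(attrs={"title": "下一页"})
        url = DOMAIN + next_nodes[0].get('href') if next_nodes else None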
Example #7
def fetch_kongjie(url):
    """Scrape one listing page and print the link and title of every article in the list boxes and the .d4 block."""
    response = get_web_site(url)
    html = response.content.decode('gbk')
    soup = BSP4(html, 'lxml')
    dl_list = soup.select(".listbox dl")
    # print(dl_list)
    for node in dl_list:
        lis = node.find_all('li')
        for li in lis:
            detail_url = f"{DOMAIN}{li.a.get('href')}"
            title = f"{li.a.get('title')}"
            print(f"{detail_url},标题:{title}")
        print('=' * 30)
    li_list = soup.select(".d4 li")
    for li in li_list:
        detail_url1 = f"{DOMAIN}{li.a.get('href')}"
        title1 = f"{li.a.get('title')}"
        print(f"{detail_url1},标题:{title1}")
Example #8
def parse(response, type):
    """Parse a category index page and fan parsing out over all of its numbered list pages."""
    url = response.url
    # List-page URLs have the form <domain>/list_<ctype>_<index>.html; split out the pieces
    base_urls = url.split('/list_')
    domain = base_urls[0]
    init_html = base_urls[-1]
    ctype = init_html.split('_')[0]
    cindex = init_html.split('_')[1].split('.')[0]
    g_set.add(url)

    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    page_list = soup.select('.pagelist li a')
    total_num = soup.select('.pagelist .pageinfo strong')[0].text
    page_max = int(total_num)
    for page in range(2, page_max + 1):
        parse_page(type, page, ctype,
                   '%s/list_%s_%s.html' % (domain, ctype, page))