Example #1
def bsp4_read(filename):
    """Read a local HTML file and return the parsed BeautifulSoup object."""
    with open(filename, 'r', encoding='UTF-8') as fp:
        html_doc = fp.read()

    soup = BSP4(html_doc, 'lxml')
    return soup
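All of these snippets rely on BSP4 as an alias for bs4.BeautifulSoup; the import below is an assumption (it is not shown in the originals), followed by a hypothetical call to bsp4_read with an illustrative file name.

# Assumed alias used throughout these examples (not part of the original snippets).
from bs4 import BeautifulSoup as BSP4

# Hypothetical usage: parse a saved page and print its <title>, if any.
soup = bsp4_read('saved_page.html')
print(soup.title.text if soup.title else 'no <title> found')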
Example #2
def parse(response):
    """

    :param response: 通过requests.get(url)方法得到的response对象
    :return:
    """
    # Get the domain
    domain = response.url[:-1]

    # Get the HTML text
    html_doc = response.content

    # Parse the text with Beautiful Soup to create the soup object
    soup = BSP4(html_doc, "lxml")

    # dir(soup) shows the many methods available on the soup object
    # print(dir(soup))

    # Each column section of the page sits inside a tbox, and all the tbox elements share one parent.
    # Select every tag with class "tbox" under the tag whose id is "p_left";
    # "p_left" is an id value, so it is prefixed with "#" in the CSS selector.
    tbox_list = soup.select("#p_left .tbox")

    print(type(tbox_list))  # <class 'list'>
    # Iterate over the tbox list and process each box
    for tbox in tbox_list:
        parse_tbox(tbox)
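For context, soup.select() takes a CSS selector string; the sketch below, with purely illustrative HTML, shows the "#id .class" pattern used above.

from bs4 import BeautifulSoup as BSP4  # assumed alias, as in Example #1

demo_html = '<div id="p_left"><div class="tbox">A</div><div class="tbox">B</div></div>'
demo_soup = BSP4(demo_html, 'lxml')
# "#p_left .tbox" matches every element with class "tbox" inside the element whose id is "p_left"
print([tag.text for tag in demo_soup.select('#p_left .tbox')])  # ['A', 'B']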
Example #3
def parse_url():
    """
    真正从title_url网站中下载要下载得文本内容
    :param url:
    :return:
    """
    num = input(f"Enter the number of the quote page to scrape (1-{title_index}), or 0 to scrape them all: ")
    motto_text = ''
    try:
        num = int(num)
        if num == 0:
            for url in g_url_set:
                response = requests.get(url)
                html_doc = response.content
                soup = BSP4(html_doc, 'lxml')
                motto_list = soup.select('.content p')

                for motto in motto_list:
                    motto_text += (motto.text + '\n')
                save_text(motto_text, num)
        elif num in range(1, title_index + 1):  # accept the full 1..title_index range offered by the prompt
            url_list = list(g_url_set)
            url = url_list[num - 1]
            response = requests.get(url)
            html_doc = response.content
            soup = BSP4(html_doc, 'lxml')
            motto_list = soup.select('.content p')
            for motto in motto_list:
                motto_text += (motto.text + '\n')
            save_text(motto_text, num)

            inq = input("Continue scraping? Enter 1 to continue, any other key to exit: ")
            if inq == '1':
                parse_url()
            else:
                pass
        else:
            print('------------------------------')
            print(f'|  Please enter a valid number! (0-{title_index})  |')
            print('------------------------------')
            parse_url()
    except ValueError:
        print('------------------------------')
        print(f'|  Please enter a valid number! (0-{title_index})  |')
        print('------------------------------')
        parse_url()
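The retry on bad input above works by calling parse_url() again from inside itself; a generic iterative pattern (a sketch, not taken from the original code) achieves the same re-prompting without growing the call stack.

# Generic sketch: loop until the user supplies a valid integer instead of recursing on bad input.
while True:
    raw = input("Enter a number: ")
    try:
        num = int(raw)
        break
    except ValueError:
        print("Please enter a valid number!")
print(num)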
Example #4
def parse(response):
    """
    对下载得页面进行处理
    :param response:
    :return:
    """
    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    tbox_list = soup.select('.listbox dl')
    for tbox in tbox_list:
        parse_tbox(tbox)
Example #5
def parse_page(type, page, ctype, url):
    """Download each not-yet-seen article linked from one listing page."""
    response = download(url, type, store_flag=False)
    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    link_list = soup.select('#p_left h2 a')
    index = 1
    for link in link_list:
        url_link = 'https://www.geyanw.com' + link['href']
        print(url_link)
        if url_link not in g_set:
            index += 1
            response = download(url_link,
                                type,
                                filename='%s_%s.html' % (ctype, index),
                                store_flag=True)
Example #6
def fetch_kongjie(url):
    """Scrape one user-list page, download each profile image, then follow the next-page link."""
    response = get_web_site(url)
    soup = BSP4(response.text, 'lxml')
    dl_list = soup.select(".oe_user_list dl")
    print(dl_list)
    for node in dl_list:
        detail_url = f"{DOMAIN}{node.dt.a.get('href')}"
        name = node.dd.h3.a.text
        detail_img = f"{DOMAIN}{node.dt.a.img.get('src')}"
        # print(f"name:{name}, detail_url:{detail_url}, detail_img:{detail_img}")
        print(f"{name}:下载完成!!")
        download_imgs(detail_img)
    # Get the URL of the next page: the link whose title attribute is "下一页" ("next page")
    next_nodes = soup.find_all(attrs={"title": "下一页"})
    if not next_nodes:
        return  # no next-page link: last page reached, stop recursing
    next_node = next_nodes[0]
    # print(f"type:{type(next_node)}, next_node:{next_node}, ")
    next_url = DOMAIN + next_node.get('href')
    fetch_kongjie(next_url)
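Since fetch_kongjie calls itself once per page, very deep pagination could hit Python's recursion limit; the sketch below is an iterative variant that reuses get_web_site, DOMAIN and download_imgs from the example above and stops when no next-page link remains.

def fetch_kongjie_iterative(url):
    # Same scraping as above, but walk the pages in a loop instead of recursing.
    while url:
        response = get_web_site(url)
        soup = BSP4(response.text, 'lxml')
        for node in soup.select(".oe_user_list dl"):
            download_imgs(f"{DOMAIN}{node.dt.a.img.get('src')}")
        # Stop when the page has no "下一页" (next page) link.
        next_nodes = soup.find_all(attrs={"title": "下一页"})
        url = DOMAIN + next_nodes[0].get('href') if next_nodes else None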
Example #7
def fetch_kongjie(url):
    """Scrape one listing page and print the link and title of every article in the list boxes and the .d4 block."""
    response = get_web_site(url)
    html = response.content.decode('gbk')
    soup = BSP4(html, 'lxml')
    dl_list = soup.select(".listbox dl")
    # print(dl_list)
    for node in dl_list:
        lis = node.find_all('li')
        for li in lis:
            detail_url = f"{DOMAIN}{li.a.get('href')}"
            title = f"{li.a.get('title')}"
            print(f"{detail_url},标题:{title}")
        print('=' * 30)
    li_list = soup.select(".d4 li")
    for li in li_list:
        detail_url1 = f"{DOMAIN}{li.a.get('href')}"
        title1 = f"{li.a.get('title')}"
        print(f"{detail_url1},标题:{title1}")
Example #8
def parse(response, type):
    """Parse a category index page and fan parsing out over all of its numbered list pages."""
    url = response.url
    # List-page URLs have the form <domain>/list_<ctype>_<index>.html; split out the pieces
    base_urls = url.split('/list_')
    domain = base_urls[0]
    init_html = base_urls[-1]
    ctype = init_html.split('_')[0]
    cindex = init_html.split('_')[1].split('.')[0]
    g_set.add(url)

    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    page_list = soup.select('.pagelist li a')
    total_num = soup.select('.pagelist .pageinfo strong')[0].text
    page_max = int(total_num)
    for page in range(2, page_max + 1):
        parse_page(type, page, ctype,
                   '%s/list_%s_%s.html' % (domain, ctype, page))