# BSP4 is assumed throughout these snippets to be an alias for bs4.BeautifulSoup;
# requests is used by the crawl/parse helpers further down.
import requests
from bs4 import BeautifulSoup as BSP4


def bsp4_read(filename):
    """Read a local HTML file and return the parsed soup object."""
    html_doc = ''
    with open(filename, 'r', encoding='UTF-8') as fp:
        html_doc = fp.read()
    soup = BSP4(html_doc, 'lxml')
    return soup
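# A minimal usage sketch for bsp4_read, assuming a page has already been saved
# to disk. The file name and the "a" selector below are placeholders for
# illustration, not taken from the original project.
def preview_links(filename='saved_page.html'):
    soup = bsp4_read(filename)
    for a in soup.select('a'):
        print(a.get('href'), a.text)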
def parse(response):
    """
    :param response: the response object returned by requests.get(url)
    :return:
    """
    # Get the domain (drop the trailing '/' from the url)
    domain = response.url[:-1]
    # Get the raw HTML text
    html_doc = response.content
    # Parse the text with Beautiful Soup to build a soup object
    soup = BSP4(html_doc, "lxml")
    # print(dir(soup))  # shows the many methods available on the soup object
    # Parse the page box by box. From inspecting the page, each column section
    # sits inside a tbox, and all tbox blocks share one parent: find every tag
    # with class "tbox" under the tag whose id is "p_left" (p_left is an id
    # value, hence the "#" prefix in the selector).
    tbox_list = soup.select("#p_left .tbox")
    print(type(tbox_list))  # <class 'list'>
    # Walk the tbox list and handle each one
    for tbox in tbox_list:
        parse_tbox(tbox)
def parse_url():
    """
    Download the actual text content from each collected title url.
    :return:
    """
    num = input(f"Enter the quote index to crawl (1-{title_index}), or 0 to crawl all: ")
    motto_text = ''
    try:
        num = int(num)
        if num == 0:
            # Crawl every collected url and save the accumulated text once
            for url in g_url_set:
                response = requests.get(url)
                html_doc = response.content
                soup = BSP4(html_doc, 'lxml')
                motto_list = soup.select('.content p')
                for motto in motto_list:
                    motto_text += (motto.text + '\n')
            save_text(motto_text, num)
        elif num in range(1, title_index):
            # Crawl only the selected url
            url_list = list(g_url_set)
            url = url_list[num - 1]
            response = requests.get(url)
            html_doc = response.content
            soup = BSP4(html_doc, 'lxml')
            motto_list = soup.select('.content p')
            for motto in motto_list:
                motto_text += (motto.text + '\n')
            save_text(motto_text, num)
            inq = input("Continue crawling? Enter 1 to continue, any other key to quit: ")
            if inq == '1':
                parse_url()
            else:
                pass
        else:
            print('------------------------------')
            print(f'| Please enter a valid number (0-{title_index})! |')
            print('------------------------------')
            parse_url()
    except ValueError:
        print('------------------------------')
        print(f'| Please enter a valid number (0-{title_index})! |')
        print('------------------------------')
        parse_url()
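# save_text is called above but not defined in this section. A minimal sketch
# of what it might look like, assuming the quotes are written to a numbered
# UTF-8 text file in the current directory; the file naming scheme is an
# assumption, not the original implementation.
def save_text(motto_text, num):
    filename = f'motto_{num}.txt'
    with open(filename, 'w', encoding='UTF-8') as fp:
        fp.write(motto_text)
    print(f'saved {filename}')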
def parse(response):
    """
    Process the downloaded page.
    :param response:
    :return:
    """
    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    tbox_list = soup.select('.listbox dl')
    for tbox in tbox_list:
        parse_tbox(tbox)
def parse_page(type, page, ctype, url):
    response = download(url, type, store_flag=False)
    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    link_list = soup.select('#p_left h2 a')
    index = 1
    for link in link_list:
        url_link = 'https://www.geyanw.com' + link['href']
        print(url_link)
        if url_link not in g_set:
            index += 1
            response = download(url_link, type, filename='%s_%s.html' % (ctype, index), store_flag=True)
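# download is used above but not shown in this section. A rough sketch of a
# compatible helper, assuming it fetches the url with requests and, when
# store_flag is True, writes the body to disk under a per-type directory;
# the directory layout is an assumption, not the original code.
import os

def download(url, type, filename=None, store_flag=False):
    response = requests.get(url)
    if store_flag and filename:
        os.makedirs(type, exist_ok=True)
        with open(os.path.join(type, filename), 'wb') as fp:
            fp.write(response.content)
    return response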
def fetch_kongjie(url):
    response = get_web_site(url)
    soup = BSP4(response.text, 'lxml')
    dl_list = soup.select(".oe_user_list dl")
    print(dl_list)
    for node in dl_list:
        detail_url = f"{DOMAIN}{node.dt.a.get('href')}"
        name = node.dd.h3.a.text
        detail_img = f"{DOMAIN}{node.dt.a.img.get('src')}"
        # print(f"name:{name}, detail_url:{detail_url}, detail_img:{detail_img}")
        print(f"{name}: download finished!!")
        download_imgs(detail_img)
    # Get the url of the next page: the link whose title attribute is "下一页" ("next page")
    next_node = soup.find_all(attrs={"title": "下一页"})[0]
    # print(f"type:{type(next_node)}, next_node:{next_node}, ")
    next_url = DOMAIN + next_node.get('href')
    fetch_kongjie(next_url)
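# download_imgs is called above but not defined in this section. A minimal
# sketch, assuming it fetches the image with requests and saves it under an
# "imgs" directory named after the last path segment; both the directory and
# the naming rule are assumptions.
import os

def download_imgs(img_url):
    os.makedirs('imgs', exist_ok=True)
    filename = os.path.join('imgs', img_url.rsplit('/', 1)[-1])
    resp = requests.get(img_url)
    with open(filename, 'wb') as fp:
        fp.write(resp.content)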
def fetch_kongjie(url):
    response = get_web_site(url)
    html = response.content.decode('gbk')
    soup = BSP4(html, 'lxml')
    dl_list = soup.select(".listbox dl")
    # print(dl_list)
    for node in dl_list:
        lis = node.find_all('li')
        for li in lis:
            detail_url = f"{DOMAIN}{li.a.get('href')}"
            title = f"{li.a.get('title')}"
            print(f"{detail_url}, title: {title}")
    print('=' * 30)
    li_list = soup.select(".d4 li")
    for li in li_list:
        detail_url1 = f"{DOMAIN}{li.a.get('href')}"
        title1 = f"{li.a.get('title')}"
        print(f"{detail_url1}, title: {title1}")
def parse(response, type):
    url = response.url
    base_urls = url.split('/list_')
    domain = base_urls[0]
    init_html = base_urls[-1]
    ctype = init_html.split('_')[0]
    cindex = init_html.split('_')[1].split('.')[0]
    g_set.add(url)
    html_doc = response.content
    soup = BSP4(html_doc, 'lxml')
    page_list = soup.select('.pagelist li a')
    total_num = soup.select('.pagelist .pageinfo strong')[0].text
    page_max = int(total_num)
    for page in range(2, page_max + 1):
        parse_page(type, page, ctype, '%s/list_%s_%s.html' % (domain, ctype, page))
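# A rough end-to-end sketch of how this parse could be driven: fetch the first
# list page of a column with requests and let parse fan out to the remaining
# pages through parse_page. The start url and category name below are examples
# only, and g_set is assumed to be a module-level set defined elsewhere.
if __name__ == '__main__':
    start_url = 'https://www.geyanw.com/lizhimingyan/list_4_1.html'
    resp = requests.get(start_url)
    parse(resp, 'lizhimingyan')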