from DBSReptileTools import DBSReptileTools
import sys

# The target hostname comes from the script's own filename: the script is
# expected to be named '<hostname>.py', so argv[0] minus '.py' is the host.
hostname = sys.argv[0]
if not hostname:
    print('hostname not found')
    sys.exit()
hostname = hostname[0:-3]  # strip the trailing '.py'
# url = 'http://{}'.format(hostname)
url_base = 'http://{}'.format(hostname)
used_urls = set()
num = 0
dbs_tools = DBSReptileTools()


def download_html(url):
    global num
    num += 1
    # Skip URLs that were already visited; otherwise record this one
    if url in used_urls:
        # num -= 1
        return
    else:
        if url_base != url:
            used_urls.add(url)
    # Fetch the page content
    try:
        html = dbs_tools.get_html(page_addr=url)
        if check_is_save(url):
            ...  # check_is_save and the save step are not part of this excerpt
    except Exception:
        return  # tolerate fetch failures so the crawl can continue
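# ---------------------------------------------------------------------------
# DBSReptileTools itself is not part of this excerpt. The stub below is only
# a sketch of the interface these scripts call: the method names and
# signatures are taken from the scripts, but every body is an assumption
# (requests-based fetching, glob-based file search, and an MD5-derived task
# id are plausible choices, not the original implementation).
# ---------------------------------------------------------------------------
import glob
import hashlib
import os
import plistlib

import requests


class DBSReptileTools:
    def __init__(self, timeout=10):
        self.timeout = timeout

    def get_html(self, page_addr):
        # The scripts treat '' as "fetch failed", so mirror that convention.
        try:
            resp = requests.get(page_addr, timeout=self.timeout)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            return ''

    def search_file_list(self, folder, search_file_rule='*.html'):
        # Every file under `folder` matching the glob pattern.
        return sorted(glob.glob(os.path.join(folder, search_file_rule)))

    def read_file_content(self, file_path):
        with open(file_path, encoding='utf-8') as f:
            return f.read()

    def save_file_w(self, save_path, content):
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def get_task_id(self, page_addr):
        # A stable, filesystem-safe id per URL; MD5 is one plausible choice.
        return hashlib.md5(page_addr.encode('utf-8')).hexdigest()

    def get_webloc_page_addr(self, read_file_path):
        # .webloc files are XML plists holding a single 'URL' key.
        with open(read_file_path, 'rb') as f:
            return plistlib.load(f)['URL']

    def get_webloc_content(self, page_addr):
        # Inverse of the above: serialise a URL as .webloc plist XML.
        return plistlib.dumps({'URL': page_addr}, fmt=plistlib.FMT_XML).decode('utf-8')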
from DBSReptileTools import DBSReptileTools
from lxml import etree
import time
import glob
import os

# TODO base variable definitions
read_folder_path = 'search_tag'
save_folder_path = 'result'
url_base = 'https://baike.yongzin.com'
# TODO counter definitions
step = 0
num_one = step
num_two = 0
dbs_tools = DBSReptileTools()
# TODO collect the file paths of every downloaded search page
file_path_list = dbs_tools.search_file_list(read_folder_path, search_file_rule='*.html')
# TODO walk every path and extract the URLs from each page
for file_path in file_path_list:
    num_one += 1
    # TODO read the already-downloaded HTML
    html_search = dbs_tools.read_file_content(file_path)
    # TODO parse it and extract the URLs
    dom = etree.HTML(html_search)
    page_addr_list = dom.xpath('//h4[@class="c-title"]/a/@href')
    # TODO download every wiki info page and save it under result
    for page_addr in page_addr_list[step:]:
        # TODO build the URL to fetch and the file path to save to
        page_addr = '{}{}'.format(url_base, page_addr)
        # task_id = dbs_tools.get_task_id(page_addr=page_addr)
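# A quick self-contained check of the XPath used above, run against a
# synthetic snippet (the markup is illustrative, not a real
# baike.yongzin.com page):
from lxml import etree

_sample = '<div><h4 class="c-title"><a href="/wiki/123">t</a></h4></div>'
_dom = etree.HTML(_sample)
assert _dom.xpath('//h4[@class="c-title"]/a/@href') == ['/wiki/123']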
from DBSReptileTools import DBSReptileTools
from lxml import etree
import time
import glob
import os

# TODO base variable definitions
read_folder_path = 'tb.xzxw.com'
save_folder_path = 'tb.xzxw.com_result'
url_base = 'https://baike.yongzin.com'
# TODO counter definitions
step = 0
num_one = step
num_two = 0
dbs_tools = DBSReptileTools(timeout=2)
# TODO collect the file paths of every saved search page
file_path_list = dbs_tools.search_file_list(read_folder_path, search_file_rule='*.webloc')
# TODO walk every path, read the URL it stores, and save that page
for file_path in file_path_list:
    num_one += 1
    page_addr = dbs_tools.get_webloc_page_addr(read_file_path=file_path)
    print(num_one, page_addr)
    task_id = dbs_tools.get_task_id(page_addr=page_addr)
    html_path = '{}/{}.html'.format(save_folder_path, task_id)
    webloc_path = '{}/{}.webloc'.format(save_folder_path, task_id)
    html = dbs_tools.get_html(page_addr=page_addr)
    if html == '':
        continue
    dbs_tools.save_file_w(html_path, html)
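# The script above just skips pages whose fetch came back as ''. If transient
# failures matter, a small retry wrapper is one option; `retries` and the
# linear backoff are assumptions, not part of the original tooling:
def get_html_with_retry(tools, page_addr, retries=3, delay=1.0):
    for attempt in range(retries):
        html = tools.get_html(page_addr=page_addr)
        if html:
            return html
        time.sleep(delay * (attempt + 1))  # back off a little more each try
    return ''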
from DBSReptileTools import DBSReptileTools
from lxml import etree
import os

read_path_base = 'search_tag'
result_path_base = 'result'
save_path_base = 'new_url.txt'
url_base = 'https://baike.yongzin.com'
dbs_tools = DBSReptileTools()
file_path_list = dbs_tools.search_file_list(read_path_base, '*.html')
num_one = 0
num_two = 0
for file_path in file_path_list:
    num_one += 1
    html = dbs_tools.read_file_content(file_path)
    dom = etree.HTML(html)
    hrefs = dom.xpath('//h4[@class="c-title"]/a/@href')
    for href in hrefs:
        num_two += 1
        page_addr = '{}{}'.format(url_base, href)
        task_id = dbs_tools.get_task_id(page_addr)
        file_name = '{}.html'.format(task_id)
        check_path = os.path.join(result_path_base, file_name)
        # print(check_path)
        # break
        if os.path.exists(check_path):
            print(num_one, num_two, page_addr, 'already downloaded')
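# The excerpt stops before save_path_base ('new_url.txt') is used. A
# plausible continuation (an assumption, not the original code) records each
# URL that has no saved result yet, so a later pass can fetch only those:
def record_missing(page_addr, check_path, new_url_path=save_path_base):
    if not os.path.exists(check_path):
        with open(new_url_path, 'a', encoding='utf-8') as f:
            f.write(page_addr + '\n')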
from DBSReptileTools import DBSReptileTools
from lxml import etree
import os
import glob

url_base = 'https://baike.yongzin.com/'
save_path_base = 'wiki_tag'
# TODO: collect the paths of every .html file under result
dbs_tools = DBSReptileTools()
file_path_list = dbs_tools.search_file_list('result', '*.html')
# TODO: walk file_path_list, extract each tag link, and save it to the folder
step = 0
num = 0
num_one = step
for file_path in file_path_list[step:]:
    num_one += 1
    html = dbs_tools.read_file_content(file_path)
    dom = etree.HTML(html)
    dom_a_href_list = dom.xpath('//a[@class="btn btn-white mr10 mt10"]/@href')
    for a_href in dom_a_href_list:
        num += 1
        print(num_one, num, len(glob.glob('wiki_tag/*.webloc')), file_path)
        page_addr = a_href.replace('../', url_base)
        task_id = dbs_tools.get_task_id(page_addr=page_addr)
        file_name = '{}.webloc'.format(task_id)
        save_path = os.path.join(save_path_base, file_name)
        if os.path.exists(save_path):
            continue
        webloc_xml = dbs_tools.get_webloc_content(page_addr=page_addr)
        dbs_tools.save_file_w(save_path, webloc_xml)
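# The '../' -> url_base replacement above assumes every tag link starts with
# exactly one '../'. urllib.parse.urljoin resolves arbitrary relative paths,
# should the markup vary (an alternative, not a fix the scripts need):
from urllib.parse import urljoin

assert urljoin('https://baike.yongzin.com/wiki/x', '../tag/1') == 'https://baike.yongzin.com/tag/1'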
from DBSReptileTools import DBSReptileTools
from lxml import etree
import math
import time
import glob
import os

webloc_path = 'wiki_tag'
save_path_base = 'search_tag'
dbs_tools = DBSReptileTools()
step = 125
num_one = step
num_two = 0
# TODO collect the paths of every .webloc file
webloc_file_list = dbs_tools.search_file_list(webloc_path, search_file_rule='*.webloc')
# TODO walk every file, read the url it stores, and save the search pages
for webloc_file in webloc_file_list[step:]:
    num_one += 1
    # base url for this search tag
    url_base = dbs_tools.get_webloc_page_addr(read_file_path=webloc_file)
    # total number of results for the current search tag
    html = dbs_tools.get_html(page_addr=url_base)
    dom = etree.HTML(html)
    total_size_str = dom.xpath('/html/body/section/div/div/div/div[1]/em/text()')
    if not total_size_str:
        raise Exception('total_size not found on search page')
    total_size = int(total_size_str[0].replace(',', ''))
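    # The excerpt ends at total_size. A sketch of the likely continuation,
    # given the `math` import above; the page size and the `page` query
    # parameter are assumptions, not taken from the original script:
    page_size = 10  # hypothetical number of results per search page
    total_pages = math.ceil(total_size / page_size)
    for page_no in range(1, total_pages + 1):
        num_two += 1
        page_url = '{}&page={}'.format(url_base, page_no)  # hypothetical paging param
        page_html = dbs_tools.get_html(page_addr=page_url)
        if page_html == '':
            continue
        task_id = dbs_tools.get_task_id(page_addr=page_url)
        dbs_tools.save_file_w(os.path.join(save_path_base, '{}.html'.format(task_id)), page_html)
        time.sleep(0.5)  # brief pause between requests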