Example #1
import sys

from DBSReptileTools import DBSReptileTools

# This example apparently relies on the script file being named after the
# target hostname (e.g. example.com.py): argv[0] minus '.py' is the hostname.
hostname = sys.argv[0]

if not hostname:
    print('hostname not found')
    exit()

hostname = hostname[0:-3]


# url = 'http://{}'.format(hostname)
url_base = 'http://{}'.format(hostname)
used_urls = set()

num = 0

dbs_tools = DBSReptileTools()
def download_html(url):
    global num
    num += 1

    # If this url has already been visited, return immediately
    if url in used_urls:
        # num -= 1
        return
    else:
        if url_base != url:
            used_urls.add(url)
    # Fetch the page content
    try:
        html = dbs_tools.get_html(page_addr=url)
        if check_is_save(url):
            # Truncated in the source; presumably the html is saved here
            # and the links on the page are crawled recursively.
            pass
    except Exception:
        return
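Example #1 also calls a check_is_save helper that is never defined. A minimal sketch of what it plausibly does, borrowing the already-downloaded check from Example #4 (the result folder and the file naming are assumptions):

import os

def check_is_save(url):
    # Hypothetical helper: True when the page for this url has not been
    # saved yet; mirrors the os.path.exists check used in Example #4.
    task_id = dbs_tools.get_task_id(page_addr=url)
    check_path = os.path.join('result', '{}.html'.format(task_id))
    return not os.path.exists(check_path)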
Example #2

from DBSReptileTools import DBSReptileTools
from lxml import etree
import time
import glob
import os


# TODO base variable definitions
read_folder_path = 'search_tag'
save_folder_path = 'result'
url_base = 'https://baike.yongzin.com'

# TODO counter variable definitions
step = 0
num_one = step
num_two = 0

dbs_tools = DBSReptileTools()
# TODO get all file paths of the search pages
file_path_list = dbs_tools.search_file_list(read_folder_path, search_file_rule='*.html')
# TODO loop over all the paths and collect the URLs from every page
for file_path in file_path_list:
    num_one += 1
    # TODO read the already-downloaded html content
    html_search = dbs_tools.read_file_content(file_path)
    # TODO parse the page and extract the urls
    dom = etree.HTML(html_search)
    page_addr_list = dom.xpath('//h4[@class="c-title"]/a/@href')
    # TODO loop-download the wiki info pages and save them into result
    for page_addr in page_addr_list[step:]:
        # TODO assemble the url to fetch and the file path to save
        page_addr = '{}{}'.format(url_base, page_addr)
        # task_id = dbs_tools.get_task_id(page_addr=page_addr)
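Example #2 breaks off inside the inner loop. Judging from the commented-out get_task_id call and the pattern of Examples #3 and #4, the loop body presumably continues along these lines (everything below is reconstructed, not from the source):

        task_id = dbs_tools.get_task_id(page_addr=page_addr)
        save_path = os.path.join(save_folder_path, '{}.html'.format(task_id))
        # Skip pages already downloaded on a previous run (as in Example #4).
        if os.path.exists(save_path):
            continue
        html_info = dbs_tools.get_html(page_addr=page_addr)
        if html_info == '':
            continue
        dbs_tools.save_file_w(save_path, html_info)
        time.sleep(0.5)  # throttle requests; the interval is a guess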
Example #3
from DBSReptileTools import DBSReptileTools
from lxml import etree
import time
import glob
import os

# TODO base variable definitions
read_folder_path = 'tb.xzxw.com'
save_folder_path = 'tb.xzxw.com_result'
url_base = 'https://baike.yongzin.com'

# TODO counter variable definitions
step = 0
num_one = step
num_two = 0

dbs_tools = DBSReptileTools(timeout=2)
# TODO get all file paths of the search pages
file_path_list = dbs_tools.search_file_list(read_folder_path,
                                            search_file_rule='*.webloc')
# TODO loop over all the paths and collect the URLs from every page
for file_path in file_path_list:
    num_one += 1
    page_addr = dbs_tools.get_webloc_page_addr(read_file_path=file_path)
    print(num_one, page_addr)
    task_id = dbs_tools.get_task_id(page_addr=page_addr)
    html_path = '{}/{}.html'.format(save_folder_path, task_id)
    webloc_path = '{}/{}.webloc'.format(save_folder_path, task_id)
    html = dbs_tools.get_html(page_addr=page_addr)
    if html == '':
        continue
    dbs_tools.save_file_w(html_path, html)
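None of the examples include the DBSReptileTools class itself. A minimal sketch of the core interface they collectively assume (method names and keyword arguments are taken from the calls above; every body here is an assumption):

import glob
import hashlib

import requests


class DBSReptileTools:
    def __init__(self, timeout=10):
        self.timeout = timeout

    def search_file_list(self, folder, search_file_rule='*.html'):
        # List the files under `folder` that match the glob rule.
        return glob.glob('{}/{}'.format(folder, search_file_rule))

    def read_file_content(self, file_path):
        with open(file_path, encoding='utf-8') as f:
            return f.read()

    def save_file_w(self, file_path, content):
        # 'w' mode, as the name suggests: overwrite on each save.
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(content)

    def get_html(self, page_addr):
        # The callers treat an empty string as "fetch failed", so map
        # request errors to '' instead of raising.
        try:
            resp = requests.get(page_addr, timeout=self.timeout)
            resp.raise_for_status()
            return resp.text
        except requests.RequestException:
            return ''

    def get_task_id(self, page_addr):
        # Any stable url-to-filename mapping works; a hash is one choice.
        return hashlib.md5(page_addr.encode('utf-8')).hexdigest()

The webloc-related helpers are sketched after Example #5, where they are used most heavily.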
Example #4
from DBSReptileTools import DBSReptileTools
from lxml import etree
import os

read_path_base = 'search_tag'
result_path_base = 'result'

save_path_base = 'new_url.txt'

url_base = 'https://baike.yongzin.com'

dbs_tools = DBSReptileTools()

file_path_list = dbs_tools.search_file_list(read_path_base, '*.html')
num_one = 0
num_two = 0
for file_path in file_path_list:
    num_one += 1
    html = dbs_tools.read_file_content(file_path)
    dom = etree.HTML(html)
    hrefs = dom.xpath('//h4[@class="c-title"]/a/@href')
    for href in hrefs:
        num_two += 1
        page_addr = '{}{}'.format(url_base, href)
        task_id = dbs_tools.get_task_id(page_addr)
        file_name = '{}.html'.format(task_id)
        check_path = os.path.join(result_path_base, file_name)
        # print(check_path)
        # break
        if os.path.exists(check_path):
            print(num_one, num_two, page_addr, 'already downloaded')
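Example #4 is cut off after the already-downloaded branch. Since save_path_base = 'new_url.txt' is defined but never used in the visible code, the if above presumably gains an else branch that records the urls still to be fetched; a sketch of that assumption:

        else:
            # Assumption: collect urls that have not been downloaded yet.
            with open(save_path_base, 'a', encoding='utf-8') as f:
                f.write('{}\n'.format(page_addr))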
Example #5
from DBSReptileTools import DBSReptileTools
from lxml import etree
import os
import glob

url_base = 'https://baike.yongzin.com/'
save_path_base = 'wiki_tag'
# TODO: get all .html file paths in result
dbs_tools = DBSReptileTools()
file_path_list = dbs_tools.search_file_list('result', '*.html')
# TODO: iterate over file_path_list, extract the tags and save them to the folder
step = 0
num = 0
num_one = step
for file_path in file_path_list[step:]:
    num_one += 1
    html = dbs_tools.read_file_content(file_path)
    dom = etree.HTML(html)
    dom_a_href_list = dom.xpath('//a[@class="btn btn-white mr10 mt10"]/@href')
    for a_href in dom_a_href_list:
        num += 1
        print(num_one, num, len(glob.glob('wiki_tag/*.webloc')), file_path)
        page_addr = a_href.replace('../', url_base)
        task_id = dbs_tools.get_task_id(page_addr=page_addr)
        file_name = '{}.webloc'.format(task_id)
        save_path = os.path.join(save_path_base, file_name)
        if os.path.exists(save_path):
            continue
        webloc_xml = dbs_tools.get_webloc_content(page_addr=page_addr)
        dbs_tools.save_file_w(save_path, webloc_xml)
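Examples #3, #5 and #6 read and write .webloc files: Apple property-list bookmarks that hold a single URL. A sketch of the two webloc helpers those examples assume, written as plain functions for brevity (only the names come from the source; the plistlib-based bodies are assumptions):

import plistlib


def get_webloc_content(page_addr):
    # Build the plist XML that macOS writes for a url bookmark.
    return ('<?xml version="1.0" encoding="UTF-8"?>\n'
            '<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN"'
            ' "http://www.apple.com/DTDs/PropertyList-1.0.dtd">\n'
            '<plist version="1.0">\n<dict>\n'
            '\t<key>URL</key>\n\t<string>{}</string>\n'
            '</dict>\n</plist>\n'.format(page_addr))


def get_webloc_page_addr(read_file_path):
    # Read the URL back out of a .webloc plist file.
    with open(read_file_path, 'rb') as f:
        return plistlib.load(f)['URL']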
Example #6
from DBSReptileTools import DBSReptileTools
from lxml import etree
import math
import time
import glob
import os

webloc_path = 'wiki_tag'
save_path_base = 'search_tag'


dbs_tools = DBSReptileTools()

step = 125
num_one = step
num_two = 0

# TODO get all webloc file paths
webloc_file_list = dbs_tools.search_file_list(webloc_path, search_file_rule='*.webloc')
# TODO iterate over every file, read its url and save the search pages
for webloc_file in webloc_file_list[step:]:
    num_one += 1
    # get the base url
    url_base = dbs_tools.get_webloc_page_addr(read_file_path=webloc_file)
    # get the total page count for the current search tag
    html = dbs_tools.get_html(page_addr=url_base)
    dom = etree.HTML(html)
    total_size_str = dom.xpath('/html/body/section/div/div/div/div[1]/em/text()')
    if not total_size_str:
        raise Exception('total_size not found on search page')
    total_size = int(total_size_str[0].replace(',', ''))
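Example #6 ends right after total_size is parsed. The otherwise-unused math import suggests the continuation derives a page count with math.ceil and then fetches and saves each search page into save_path_base; a sketch of that assumed continuation (the page size and the ?page= url scheme are guesses):

    page_size = 10  # results per search page; an assumption
    total_pages = math.ceil(total_size / page_size)
    for page_no in range(1, total_pages + 1):
        num_two += 1
        page_addr = '{}&page={}'.format(url_base, page_no)
        task_id = dbs_tools.get_task_id(page_addr=page_addr)
        save_path = os.path.join(save_path_base, '{}.html'.format(task_id))
        if os.path.exists(save_path):
            continue
        page_html = dbs_tools.get_html(page_addr=page_addr)
        if page_html == '':
            continue
        dbs_tools.save_file_w(save_path, page_html)
        time.sleep(0.5)  # throttle; the interval is a guess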